/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

/* modifications for MX kernel lib made by
 * Brice.Goglin@ens-lyon.org (LIP/INRIA/ENS-Lyon) */

/* Version-control identification string: the "@(#)" marker followed by the
   RCS Id keyword expansion for this file.  Nothing in the visible part of
   the file reads it. */
static const char __idstring[] = "@(#)$Id: mx_common.c,v 1.349.2.6 2006/11/10 07:13:07 loic Exp $";

#include "mx_arch.h"
#include "mx_misc.h"
#include "mx_instance.h"
#include "mx_malloc.h"
#include "mx_pio.h"
#include "mx_peer.h"
#include "mx_stbar.h"
#include "mx_util.h"
#include "mx_kraw.h"
#include "mx_version.h"
#include "mx_cpu.h"

/* Default: compile the mx_common_*_copyblock() functions found later in this
   file unless MX_DISABLE_COMMON_COPYBLOCK was already defined non-zero. */
#ifndef MX_DISABLE_COMMON_COPYBLOCK
#define MX_DISABLE_COMMON_COPYBLOCK 0
#endif

/* Defaults used when MX_DMA_VPAGE_SIZE has not been defined beforehand:
   a DMA vpage is MX_VPAGE_SIZE bytes, and the MX_ALLOC_DMA_PAGE /
   MX_FREE_DMA_PAGE wrappers drop their `len' argument and forward to
   mx_alloc_dma_page() / mx_free_dma_page(). */
#ifndef MX_DMA_VPAGE_SIZE 
#define MX_DMA_VPAGE_SIZE MX_VPAGE_SIZE
#define MX_ALLOC_DMA_PAGE(is, alloc_addr, addr, pin, len) mx_alloc_dma_page(is, alloc_addr, addr, pin)
#define MX_FREE_DMA_PAGE(is, alloc_addr, pin, len) mx_free_dma_page(is, alloc_addr, pin)
#endif

/* set to 1 at the end of a successful mx_init_driver(); mx_finalize_driver()
   refuses to run while it is 0 */
int mx_initialized = 0;
/* message size thresholds; not referenced in this part of the file --
   presumably byte counts consumed elsewhere in the driver (TODO confirm) */
uint32_t mx_small_message_threshold = 128;
uint32_t mx_medium_message_threshold = 32*1024;
/* when non-zero, mx_init_driver() prints a "Security is disabled" warning */
uint32_t mx_security_disabled = 0;
/* initialized in mx_init_driver(), destroyed in mx_finalize_driver() */
mx_spinlock_t mx_lanai_print_spinlock;

/* held by mx_get_instance() while it reads the mx_instances[] array */
mx_sync_t mx_global_mutex;

/*
 * One-time driver initialization: announce the version, install the
 * board operations, allocate the global instance array and the MCP
 * globals area, initialize the peer table and the global locks.
 *
 * Returns 0 on success.  Returns EIO when the driver was built with
 * SSE2 support and the CPU lacks it, ENOMEM when one of the global
 * allocations fails, or the status reported by mx_init_peers().
 * On failure everything allocated here is released again and the
 * global pointers are reset so that no stale address is left behind.
 */
int
mx_init_driver(void)
{
  int status;
#if MX_CPU_x86 && MX_ENABLE_SSE2 && !defined _MSC_VER
  if (!mx__cpu_has_sse2()) {
    MX_WARN(("Processor without sse2: recompile with --disable-sse2.\n"));
    return EIO;
  }
#endif
  MX_INFO(("Version %s\n", MX_VERSION_STR));
  MX_INFO(("Build %s\n", MX_BUILD_STR));
  MX_INFO(("Debug %s\n", MX_DEBUG ? "ON" : "OFF"));
  mx_set_default_hostname();

  mx_lx_init_board_ops();
  mx_lz_init_board_ops();

  MX_DEBUG_PRINT 
    (MX_DEBUG_BOARD_INIT,
     ("MX configured for %d instances\n", mx_max_instance));
  mx_instances = mx_kmalloc 
    (sizeof(mx_instances[0]) * mx_max_instance, MX_MZERO);
  if (mx_instances == 0) {
    return ENOMEM;
  }

  mx_mcpi.globals = 
    mx_kmalloc(mx_mcpi.memory_size * mx_max_instance, MX_MZERO);
  if (mx_mcpi.globals == 0) {
    status = ENOMEM;
    goto abort_with_instances;
  }

  /* report the real reason when the peer table cannot be set up */
  status = mx_init_peers();
  if (status) {
    goto abort_with_globals;
  }

  mx_sync_init(&mx_global_mutex, NULL, -1, "mx_global_mutex");
  mx_spin_lock_init(&mx_lanai_print_spinlock, NULL, -1, "lanai print spinlock");
  mx_initialized = 1;

  if (mx_security_disabled) {
    MX_WARN(("Security is disabled\n"));
  }

  return 0;

 abort_with_globals:
  mx_kfree(mx_mcpi.globals);
  mx_mcpi.globals = 0;

 abort_with_instances:
  /* do not leave the global pointing at freed memory */
  mx_kfree(mx_instances);
  mx_instances = 0;
  return status;
}

/*
 * Undo mx_init_driver(): free the global instance array and the MCP
 * globals area, destroy the global locks and the peer table.
 *
 * Returns -1 when the driver is not initialized, 0 otherwise.  The
 * initialized flag is cleared so that a repeated call is rejected
 * instead of destroying the same objects a second time.
 */
int
mx_finalize_driver(void)
{
  if (!mx_initialized) {
    return -1;
  }
  mx_initialized = 0;

  mx_kfree(mx_instances);
  mx_instances = 0;
  mx_kfree(mx_mcpi.globals);
  mx_mcpi.globals = 0;
  mx_sync_destroy(&mx_global_mutex);
  mx_spin_lock_destroy(&mx_lanai_print_spinlock);
  mx_destroy_peers();
  return 0;
}

#if !MX_OPTIMIZED_DMA_PAGE_ALLOC

/*
 * Allocate one pinned, zero-filled kernel page for DMA.
 *
 * Two pages are requested from mx_kmalloc() and the buffer handed out
 * starts one MX_PAGE_SIZE above the truncated-to-page start of that
 * allocation.  On success *alloc_addr receives the raw allocation (the
 * pointer to give back to mx_free_dma_page), *addr the buffer, and
 * *pin describes the pinned page.
 *
 * Returns 0 on success and ENOMEM when either the allocation or the
 * pin fails; the outputs *alloc_addr and *addr are left untouched then.
 */
int
mx_alloc_dma_page(mx_instance_state_t *is, char **alloc_addr, 
		  char **addr, mx_page_pin_t *pin)
{
  char *raw;
  char *page;

  raw = mx_kmalloc(2*PAGE_SIZE, MX_WAITOK|MX_MZERO);
  if (raw == NULL) {
    return ENOMEM;
  }

  page = (char *)(uintptr_t)MX_TRUNC_PAGE((uint64_t)(uintptr_t)raw) + MX_PAGE_SIZE;
  pin->va = (uint64_t)(uintptr_t)page;
  if (mx_pin_page(is, pin, MX_PIN_KERNEL | MX_PIN_CONSISTENT, 0) != 0) {
    /* pinning failed: give the memory back, report out-of-memory */
    mx_kfree(raw);
    return ENOMEM;
  }

  *alloc_addr = raw;
  *addr = page;
  return 0;
}

/*
 * Release a page obtained from mx_alloc_dma_page(): unpin it, then
 * free the raw allocation stored in *alloc_addr.  Only when MX_DEBUG
 * is non-zero is *alloc_addr cleared afterwards.
 */
void
mx_free_dma_page(mx_instance_state_t *is, char **alloc_addr, mx_page_pin_t *pin)
{
  char *raw;

  mx_unpin_page(is, pin, MX_PIN_KERNEL | MX_PIN_CONSISTENT);

  raw = *alloc_addr;
  mx_kfree(raw);

  if (MX_DEBUG) {
    *alloc_addr = 0;
  }
}
#endif /* MX_OPTIMIZED_DMA_PAGE_ALLOC*/

/*
 * Same contract as mx_alloc_dma_page(), with the additional guarantee
 * that the first PAGE_SIZE bytes at *addr are cleared on success.
 * The status of mx_alloc_dma_page() is returned unchanged.
 */
int
mx_alloc_zeroed_dma_page(mx_instance_state_t *is, char **alloc_addr, 
			 char **addr, mx_page_pin_t *pin)
{
  int status = mx_alloc_dma_page(is, alloc_addr, addr, pin);

  if (status != 0) {
    return status;
  }

  bzero(*addr, PAGE_SIZE);
  return 0;
}

/*
 * Look up board `unit' and take a reference on it.
 *
 * Returns NULL when unit is out of range or no instance is installed
 * in that slot.  Otherwise the instance's ref_count is incremented
 * and the instance is returned; mx_release_instance() drops the
 * reference again.
 */
mx_instance_state_t *
mx_get_instance(uint32_t unit)
{
  mx_instance_state_t *found = NULL;

  if (unit < mx_max_instance) {
    /* the global mutex keeps the instance array stable while the
       slot is read and the reference is taken */
    mx_mutex_enter(&mx_global_mutex);
    found = mx_instances[unit];
    if (found != 0) {
      mx_atomic_add(1, &found->ref_count);
    }
    mx_mutex_exit(&mx_global_mutex);
  }

  return found;
}

/* Drop a reference taken by mx_get_instance(): atomically decrement
   is->ref_count.  Nothing else happens here; in particular the global
   mutex is not taken. */
void
mx_release_instance(mx_instance_state_t *is)
{
  mx_atomic_subtract(1, &is->ref_count);
}

/*
 * Ask the MCP to log `size' bytes, wait for it to finish, then copy the
 * logging data out of NIC SRAM to `out' through a one-vpage bounce
 * buffer.  `is_kernel' is handed to mx_copyout() unchanged.
 *
 * Returns 0 on success (also when size is 0), ENXIO when the board is
 * dead or the logging buffer cannot be located, EBUSY when another
 * logging request is in progress, ENOMEM when the bounce buffer cannot
 * be allocated, or the status of the failing lanai command, sleep or
 * copyout.
 *
 * NOTE(review): unlike mx_get_counters(), nothing here compares
 * logging_offset + size against the SRAM size; the copy length is
 * only clamped to is->logging.size -- confirm that this is sufficient.
 */
static int
mx_get_logging(mx_instance_state_t *is, uint32_t size, mx_uaddr_t out,
	       uint32_t is_kernel)
{
  char *sram_ptr, *bounce;
  int status = 0;
  uint32_t logging_offset, dont_care, copy_size;
  mx_sync_t tmp_sync;

  /* sanity check */
  if (mx_is_dead(is)) {
    return (ENXIO);
  }
  
  /* sanity check */
  if (size == 0) {
    return (0);
  }
  
  /* get the SRAM offset of the logging data */
  status = mx_mcpi.get_param(is->id, is->lanai.sram, "logging_offset", 
			     &logging_offset);
  if (status) {
    MX_WARN (("Don't know where are the logging buffer in the SRAM\n"));
    return (ENXIO);
  }

  /* sanity check */
  if (logging_offset == 0) {
    MX_WARN (("Logging is not enabled in the MCP build\n"));
    return (ENXIO);
  }
  
  /* claim the logging facility: MX_IS_LOGGING acts as a busy flag, tested
     and set under is->sync so only one request runs at a time */
  mx_mutex_enter(&is->sync);
  if (is->flags & MX_IS_LOGGING) {
    mx_mutex_exit(&is->sync);
    return EBUSY;
  }
  is->flags |= MX_IS_LOGGING;
  mx_mutex_exit(&is->sync);
  
  sram_ptr = (char *) is->lanai.sram + (size_t) logging_offset;
  /* private sync object handed to mx_lanai_command() below */
  mx_sync_init(&tmp_sync, is, 0, "loggin temp sync");
  bounce = mx_kmalloc(MX_VPAGE_SIZE, MX_WAITOK|MX_MZERO);
  if (bounce == 0) {
    status = ENOMEM;
    goto abort_with_sync;
  }

  /* tell the MCP to start logging */
  status = mx_lanai_command(is, MX_MCP_CMD_START_LOGGING,
			    size, 0, 0, &dont_care, &tmp_sync);
  if (status) {
    MX_WARN (("Couldn't start logging\n"));
    goto abort_with_buffer;
  }

  /* wait for the MCP to log the requested amount of information;
     the sleep is interruptible and bounded by MX_LOGGING_WAIT */
  status = mx_sleep(&is->logging.sync, MX_LOGGING_WAIT, MX_SLEEP_INTR);
  if (status) {
    if (status == EAGAIN)
      MX_WARN(("time out waiting for logging buffer\n"));
    mx_sync_reset(&is->logging.sync);
    goto abort_with_buffer;
  }
  
  /* retrieve the logging data: never more than is->logging.size bytes,
     at most one vpage per iteration (the size of the bounce buffer) */
  size = (size > is->logging.size) ? is->logging.size : size;
  while (size > 0) {
    copy_size = (size > MX_VPAGE_SIZE) ? MX_VPAGE_SIZE : size;
    mx_pio_bcopy_read(sram_ptr, bounce, copy_size);
    status = mx_copyout(bounce, out, copy_size, is_kernel);
    if (status) {
      MX_WARN(("failed to copy logging buffer out to 0x%lx\n", 
	       (unsigned long) out));
      break;
    }

    size -= copy_size;
    sram_ptr += copy_size;
    out += copy_size;
  }

  /* common exit: the success path falls through the cleanup labels too */
 abort_with_buffer:
  mx_kfree(bounce);
   
 abort_with_sync:
  mx_sync_destroy(&tmp_sync);
  /* release the busy flag taken above */
  mx_mutex_enter(&is->sync);
  is->flags &= ~MX_IS_LOGGING;
  mx_mutex_exit(&is->sync);
  return status;
}


/*
 * Copy the MCP counters of board `is' from NIC SRAM to `out'.
 *
 * The number of counters comes from mx_mcpi.get_counters() and their
 * SRAM offset from the "counters_offset" MCP parameter; each counter
 * is a uint32_t.  The counters are read with mx_pio_bcopy_read() into
 * a temporary buffer and handed to mx_copyout() together with
 * `is_kernel'.
 *
 * Returns 0 on success, ENXIO when the counter table or offset is
 * unknown, EINVAL when the counter area does not lie entirely inside
 * the SRAM, ENOMEM when the temporary buffer cannot be allocated, or
 * the status of mx_copyout().
 */
static int
mx_get_counters(mx_instance_state_t *is, mx_uaddr_t out, uint32_t is_kernel)
{
  uint32_t mx_mcp_counters_count, mx_mcp_counters_offset, counters_len;
  const char **mx_mcp_counters;
  uint64_t len64, sram_bytes;
  char *ptr;
  int status;
  char *tmp;
  
  status = mx_mcpi.get_counters(is->board_type, &mx_mcp_counters, 
				&mx_mcp_counters_count);
  if (status) {
    MX_WARN (("Don't know which counters we have; board type = %d\n",
	      is->board_type));
    return (ENXIO);
  }

  status = mx_mcpi.get_param(is->id, is->lanai.sram, 
			     "counters_offset", 
			     &mx_mcp_counters_offset);
  if (status) {
    MX_WARN (("Don't know where are the counters in the SRAM\n"));
    return (ENXIO);
  }
  
  /* Validate offset and length as integers, in 64 bits, before any
     pointer is formed: a pointer computed past the end of the mapping
     may wrap and slip through a pointer comparison, and the 32-bit
     byte count itself could overflow. */
  len64 = (uint64_t)sizeof (uint32_t) * mx_mcp_counters_count;
  sram_bytes = (uint64_t)is->sram_size;
  counters_len = (uint32_t)len64;

  if (len64 != counters_len ||
      mx_mcp_counters_offset > sram_bytes ||
      len64 > sram_bytes - mx_mcp_counters_offset) {
    MX_WARN (("PIOed counters have bad size (0x%x) or location (0x%x)\n",
	      counters_len, mx_mcp_counters_offset));
    return EINVAL;
  }
  ptr = (char *)is->lanai.sram + (size_t)mx_mcp_counters_offset;

  tmp = mx_kmalloc(counters_len, MX_WAITOK);
  if (tmp == NULL) {
    MX_WARN(("Could not malloc %d bytes for counters\n",
	     (int)counters_len));
    return ENOMEM;
  }
  mx_pio_bcopy_read(ptr, tmp, counters_len);
  status = mx_copyout(tmp, out, counters_len, is_kernel);
  mx_kfree(tmp);
  return status;
}


/*
 * Run one DMA benchmark on board `is'.
 *
 * x->dma_read selects the read or write benchmark, x->count (at most
 * 100) and x->log_size (log2 of the transfer size, which may not
 * exceed one vpage) are packed into a single 16-bit command argument.
 * A DMA page is allocated for the duration of the run.  After the
 * wait, x->count and x->cycles are overwritten with the values found
 * in is->dmabench.
 *
 * Returns EINVAL for out-of-range parameters, EBUSY when a benchmark
 * is already running on this board, or the status of the page
 * allocation, the lanai command or the sleep.
 */
static int
mx_run_dmabench(mx_instance_state_t *is, mx_dmabench_t *x)
{
  mx_page_pin_t pin;
  char *alloc_addr, *dont_care_char;
  uint32_t dont_care_int;
  uint16_t count_size;
  int status, cmd;

  /* Bound log_size before it is used as a shift count: shifting an int
     by its width or more is undefined and could let a huge log_size
     pass the size test.  The 64-bit cast also rejects negative values
     should the field be signed. */
  if ((x->count > 100) ||
      ((uint64_t)x->log_size >= 31) ||
      (1 << x->log_size) > MX_VPAGE_SIZE)
    return EINVAL;

  /* dmabench.busy is a one-at-a-time guard, tested and set under is->sync */
  mx_mutex_enter(&is->sync);
  if (is->dmabench.busy) {
    mx_mutex_exit(&is->sync);
    return EBUSY;
  }
  is->dmabench.busy = 1;
  mx_mutex_exit(&is->sync);
  status = mx_alloc_dma_page(is, &alloc_addr, &dont_care_char, &pin);
  if (status)
    goto abort_busy;

  if (x->dma_read)
    cmd = MX_MCP_CMD_START_DMABENCH_READ;
  else 
    cmd = MX_MCP_CMD_START_DMABENCH_WRITE;
  /* count in the high byte, log2(size) in the low byte */
  count_size = (x->count << 8) | x->log_size;
  status = mx_lanai_command(is, cmd,
			    count_size,
			    pin.dma.high,
			    pin.dma.low, &dont_care_int,
			    &is->dmabench.cmd_sync);

  if (status) {
    MX_WARN(("Could not start DMA benchmark, status = %d\n", status));
    MX_WARN(("0x%x 0x%x\n", pin.dma.high, pin.dma.low));
    goto abort_with_dmapage;
  }

  status = mx_sleep(&is->dmabench.wait_sync, MX_COMMAND_WAIT * 2,
		    MX_SLEEP_NOINTR);

  /* copied back whatever the sleep returned, as before */
  x->count = is->dmabench.count;
  x->cycles = is->dmabench.cycles;

 abort_with_dmapage:
  mx_free_dma_page(is, &alloc_addr, &pin);

 abort_busy:
  mx_mutex_enter(&is->sync);
  is->dmabench.busy = 0;
  mx_mutex_exit(&is->sync);

  return status;
}


#ifndef MX_FACTORIZED_PAGE_PIN 

/*
 * Take an array of pins and pin each underlying host page, and fill
 * in the DMA address for each vpage in the mdesc array.  Note that
 * PAGE_SIZE could be > MX_VPAGE_SIZE, in which case not all pins
 * would be used.
 *
 * The walk starts at pins[0].va and advances by MX_VPAGE_SIZE per
 * entry.  A host page is pinned only when the host page index changes;
 * the slots in between get dma.low = MX_DMA_INVALID_ENTRY, which is
 * how mx_unpin_vpages() recognizes slots that hold no pin.
 *
 * Returns 0 on success.  When mx_pin_page() fails at entry i, entries
 * 0..i-1 are unpinned again and that status is returned.
 */

int
mx_pin_vpages(mx_instance_state_t *is, mx_page_pin_t *pins, 
	      mcp_dma_addr_t *mdesc, int nvpages,
	      int flags, uint64_t memory_context)
{
  int i, status;
  uint64_t va;
  uint32_t page_offset;
  unsigned long pindex, prev_pindex;
  mx_page_pin_t *last_pin = NULL;


  mx_assert(nvpages <= MX_ADDRS_PER_VPAGE);
  va = pins[0].va;
  /* differs from the first page index, so the first entry always pins */
  prev_pindex = MX_ATOP(va) - 1;
  for (i = 0; i < nvpages; i++) {
    /* offset of this vpage inside its (possibly larger) host page */
    page_offset = (uint32_t)(va - MX_TRUNC_PAGE(va));
    pindex = MX_ATOP(va);

    /* pin the underlying host page if needed */
    if (prev_pindex != pindex) {
      pins[i].va = MX_TRUNC_PAGE(va);
      status = mx_pin_page(is, &pins[i], flags, memory_context);
      if (status)
	goto abort_with_pins;
      last_pin = &pins[i];
    } else {
      /* same host page as the previous vpage: mark the slot as unused */
      pins[i].dma.low = MX_DMA_INVALID_ENTRY;
    }
    prev_pindex = pindex;

    /* save the dma address of the current page; the following
       descriptor is invalidated first, and a store barrier is issued
       before the current descriptor is written */
    if (i + 1 < nvpages)
      mdesc[i+1].low = MX_DMA_INVALID_ENTRY;
    MX_STBAR();
    mdesc[i].high = htonl(last_pin->dma.high);
    mdesc[i].low = htonl(last_pin->dma.low + page_offset);

    va += (uint64_t)MX_VPAGE_SIZE;
  }

  return 0;

 abort_with_pins:
  /* entry i itself failed to pin, so only [0, i) is released */
  mx_unpin_vpages(is, pins, i, flags);
  return status;
}

/*
 * Release the host pages recorded in pins[0 .. npages-1].  When
 * PAGE_SIZE != MX_VPAGE_SIZE several vpages share one host page; the
 * slots that carry no pin of their own are marked with
 * dma.low == MX_DMA_INVALID_ENTRY and are skipped here.
 */

void
mx_unpin_vpages(mx_instance_state_t *is, mx_page_pin_t *pins, int npages, int flags)
{
  int slot = 0;

  while (slot < npages) {
    mx_page_pin_t *cur = &pins[slot++];

    if (cur->dma.low == MX_DMA_INVALID_ENTRY) {
      continue;
    }
    mx_unpin_page(is, cur, flags);
  }
}

#endif /* MX_FACTORIZED_PAGE_PIN */

/*
 * Number of virtual pages touched by user segment `seg': 0 for an
 * empty segment, otherwise the count of vpages from the one holding
 * the first byte to the one holding the last byte, inclusive.
 */
static int
vpages_per_useg(mx_reg_seg_t *seg)
{
  mx_uaddr_t first, last;
  int page_count;

  if (seg->len == 0) {
    return 0;
  }

  first = (mx_uaddr_t)seg->vaddr;
  last = first + seg->len - 1;
  page_count = (int)(MX_ATOVP(last) - MX_ATOVP(first) + 1);

  MX_DEBUG_PRINT(0, ("%d pages for address 0x%lx, len = %d\n", 
		     page_count, (unsigned long) first, seg->len));
  mx_assert (page_count >= 1);
  return page_count;
}

/*
 * Free everything hanging off DMA table `tbl', but not `tbl' itself.
 *
 * A table with log_level > 0 is a directory: each non-NULL sub-table
 * is freed recursively and then released, followed by the array of
 * sub-table pointers.  A table with log_level <= 0 is a leaf and only
 * owns its pin array.  In both cases the table's own descriptor page
 * is released last.  Every pointer is cleared after being freed.
 */
static void
mx_free_dma_table (mx_instance_state_t *is, mx_dma_table_t *tbl)
{
  int idx;

  if (tbl->log_level <= 0) {
    /* leaf: only the pin array to release */
    if (tbl->u.pins) {
      mx_kfree(tbl->u.pins);
      tbl->u.pins = 0;
    }
  } else if (tbl->u.tables) {
    /* directory: tear down the children first */
    for (idx = 0; idx < tbl->nb_subtables; idx++) {
      mx_dma_table_t *sub = tbl->u.tables[idx];

      if (sub) {
	mx_free_dma_table(is, sub);
	mx_kfree(sub);
      }
      tbl->u.tables[idx] = 0;
    }
    mx_kfree(tbl->u.tables);
    tbl->u.tables = 0;
  }

  if (tbl->alloc_addr) {
    mx_free_dma_page(is, &tbl->alloc_addr, &tbl->pin);
    tbl->alloc_addr = 0;
  }
}

/*
 * Allocate the resources of DMA table `tbl', whose log_level and
 * nb_entries must already be set by the caller.
 *
 * Every table gets one DMA page holding its descriptors (tbl->desc).
 * A leaf (log_level == 0) additionally gets an array of nb_entries
 * page pins.  A directory (log_level > 0) gets an array of sub-tables,
 * each covering up to (1 << log_level) entries and filled recursively;
 * the DMA address of each sub-table's descriptor page is stored, in
 * network byte order, in this table's descriptors.
 *
 * Returns 0 on success.  The handle_enomem label frees the partially
 * built table and returns ENOMEM; nothing in this function jumps to it
 * explicitly, so presumably the mx_mem_check() macro does when its
 * argument is false -- confirm against its definition.
 */
static int
mx_fill_dma_table (mx_instance_state_t *is, mx_dma_table_t *tbl)
{
  int rc;
  long nb_entries = tbl->nb_entries;
  /* only case of nb_entries == 0 is rdma win of null length */
  mx_assert(nb_entries > 0 || tbl->log_level == 0);
  mx_assert(tbl->log_level >= 0);

  /* start from a clean state so that mx_free_dma_table() is safe to call
     at any point below */
  tbl->nb_subtables = 0;
  tbl->alloc_addr = 0;
  tbl->u.tables = 0;
  rc = mx_alloc_dma_page(is, &tbl->alloc_addr, (char**)&tbl->desc, &tbl->pin);
  mx_mem_check(!rc);
  /* an invalid first entry marks the descriptor list as empty for now */
  tbl->desc[0].low = MX_DMA_INVALID_ENTRY;
  if (tbl->log_level == 0) {
    mx_assert(nb_entries <= MX_ADDRS_PER_VPAGE);
    if (nb_entries) {
      tbl->u.pins = mx_kmalloc(sizeof(*tbl->u.pins)*nb_entries, MX_WAITOK | MX_MZERO);
      mx_mem_check(tbl->u.pins);
    }
  } else {
    int i;
    long remain = nb_entries;
    long sub_max_size = 1 << tbl->log_level;
    /* alloc sub-tables */
    tbl->nb_subtables = MX_ROUND_UP(nb_entries, sub_max_size) >> tbl->log_level;
    mx_assert(tbl->nb_subtables <= MX_ADDRS_PER_VPAGE);
    tbl->u.tables = mx_kmalloc(sizeof(tbl->u.tables[0])*tbl->nb_subtables, MX_WAITOK | MX_MZERO);
    mx_mem_check(tbl->u.tables);

    for (i=0;i<tbl->nb_subtables;i++) {
      int subentries;
      /* every sub-table is full except possibly the last one */
      subentries = MIN(sub_max_size, remain);
      tbl->u.tables[i] = mx_kmalloc(sizeof(*tbl->u.tables[i]), MX_WAITOK | MX_MZERO);
      mx_mem_check(tbl->u.tables[i]);
      tbl->u.tables[i]->log_level = tbl->log_level - MX_ADDRS_PER_VPAGE_SHIFT;
      tbl->u.tables[i]->nb_entries = subentries;
      rc = mx_fill_dma_table(is, tbl->u.tables[i]);
      mx_mem_check(!rc);
      remain -= subentries;
      /* publish the child's descriptor page in this directory */
      tbl->desc[i].high = htonl(tbl->u.tables[i]->pin.dma.high);
      tbl->desc[i].low = htonl(tbl->u.tables[i]->pin.dma.low);
    }
    /* terminate the directory when it does not fill the whole vpage */
    if (tbl->nb_subtables <MX_ADDRS_PER_VPAGE) {
      tbl->desc[tbl->nb_subtables].low = MX_DMA_INVALID_ENTRY;
    }
  }
  return 0;

 handle_enomem:
  mx_free_dma_table(is,tbl);
  return ENOMEM;
}


/*
 * Return the leaf DMA table of window `hc' that holds host segment
 * number `hseg'.  Starting at the window's root table, one directory
 * level is descended per iteration, the sub-table being selected by
 * the bits of hseg above that level's log_level.
 */
static struct mx_dma_table *
mx_dma_table_for_seg(mx_host_dma_win_t *hc, long hseg)
{
  struct mx_dma_table *tbl = &hc->table;
  int tbl_index;

  mx_assert(hseg < (MX_ADDRS_PER_VPAGE << tbl->log_level));
  mx_assert(hseg < tbl->nb_entries);

  /* a log_level of zero identifies a leaf */
  while (tbl->log_level != 0) {
    tbl_index = (hseg >> tbl->log_level) & (MX_ADDRS_PER_VPAGE - 1);
    mx_assert(tbl_index < tbl->nb_subtables);
    tbl = tbl->u.tables[tbl_index];
  }

  mx_assert((hseg & (MX_ADDRS_PER_VPAGE - 1)) < tbl->nb_entries);
  return tbl;
}

/* Free a host DMA window: first the table tree embedded in hc (descriptor
   pages, sub-tables and pin arrays), then hc itself.  The user pages
   recorded in the pin arrays are not unpinned here; mx_deregister(), for
   instance, calls mx_unpin_vpages() before calling this function. */
void
mx_free_dma_win (mx_instance_state_t *is, mx_host_dma_win_t *hc)
{
  mx_free_dma_table(is, &hc->table);
  mx_kfree(hc);
}


/* Allocate a new host dma chunk container, and its associated vpages.

   The host dma chunk structure consists of an array of page pin
   info for all the pages in the send or receive, plus a linked list
   of all the mcp dma chunk structures.

   The mcp dma chunk structures contain a pointer to a vpage, where
   DMA addresses are stored in network byte order, and which the MCP 
   reads.  It also contains the pin for that vpage, and a pointer
   to the next chunk in the list.

   NOTE(review): the code below builds a tree of mx_dma_table
   structures (see mx_fill_dma_table) rather than a linked list; the
   description above may predate that layout.

   Returns the new window, or NULL when memory runs out.
*/

mx_host_dma_win_t *
mx_new_dma_win(mx_instance_state_t *is, int nsegs)
{
  mx_host_dma_win_t *hc;
  int rc;

  /* zero-filled, so hc->table.log_level starts at 0 */
  hc = mx_kmalloc(sizeof (*hc), MX_WAITOK | MX_MZERO);
  mx_mem_check(hc);

  hc->table.nb_entries = nsegs;
  /* add directory levels until the root table can address nsegs entries */
  while ((MX_ADDRS_PER_VPAGE << hc->table.log_level) < nsegs) {
    hc->table.log_level += MX_ADDRS_PER_VPAGE_SHIFT;
  }
  rc = mx_fill_dma_table(is,&hc->table);
  mx_mem_check(!rc);

  return hc;

  /* presumably reached through mx_mem_check() when its argument is false
     -- confirm against the macro definition */
 handle_enomem:
  if (hc != NULL)
    mx_free_dma_win(is, hc);
  return NULL;
}

/*
 * Register the memory described by usegs[0] as RDMA window `handle' of
 * endpoint `es': pin its pages, build the DMA table tree and publish
 * the table addresses and the length to the MCP through PIO writes.
 *
 * Only a single segment is accepted, and both its address and its
 * length must be vpage aligned.  For a non-kernel endpoint
 * memory_context must be MX_PIN_UNDEFINED on entry and is replaced by
 * the result of mx_get_memory_context().
 *
 * When the handle already has a window installed, the existing table
 * is announced to the MCP again, the new segment is ignored and 0 is
 * returned.
 *
 * Returns 0 on success, EINVAL for bad arguments, ENOMEM when the
 * window cannot be allocated, EBUSY on a lost registration race, or
 * the status of mx_pin_vpages().
 *
 * NOTE(review): `handle' indexes es->host_rdma_windows[] and
 * es->mcp_rdma_windows[] without a range check in this function;
 * presumably the caller validates it -- confirm.
 */
static int
mx_register(mx_endpt_state_t *es, uint32_t handle, 
	    uint32_t num_usegs, mx_reg_seg_t *usegs,
	    uintptr_t memory_context)
{
  uint32_t flags;
  mx_instance_state_t *is = es->is;
  mx_host_dma_win_t *hc;
  int hseg, status, num_hsegs, npages;
  uint32_t length;
  struct mx_dma_table *tbl;
  mx_reg_seg_t * cur_useg;
  uint64_t cur_va;

  /* only one segment is supported in the general case */
  if (num_usegs > 1) {
    MX_WARN(("mx_register: scatter/gather is not yet supported \n"));
    return EINVAL;
  }

  /* use the first segment */
  length = usegs[0].len;
  if (MX_VPAGE_OFFSET(usegs[0].vaddr) || MX_VPAGE_OFFSET(length)) {
    MX_WARN(("mx_register: rdma window must use page boundaries\n"));
    return EINVAL;
  }

  /* find the number of dma segments required for this registration */
  num_hsegs = vpages_per_useg(&usegs[0]);

  /* a user endpoint may only use user memory, and must pass an
     undefined memory context; the real one is looked up here */
  if (!es->is_kernel) {
    if (memory_context != MX_PIN_UNDEFINED) {
      MX_WARN(("mx_register: user memory context should be undefined\n"));
      return EINVAL;
    }
    memory_context = mx_get_memory_context();
  }

  /* reject a negative page count or one too large for an int byte count */
  if (num_hsegs < 0 || num_hsegs >= 0x7FFFFFFF / MX_PAGE_SIZE) {
    return EINVAL;
  }

  /* grab the es mutex, so as to ensure that no other thread is racing
     to register something on this handle.  This is theoretically not
     needed, but is here to protect us from malicious users */
  mx_mutex_enter(&es->sync);

  if (es->host_rdma_windows[handle].win) {
    /* reinitialize the handle for the MCP: the window installed
       earlier is reused as is, the new segment is not looked at */
    hc = es->host_rdma_windows[handle].win;
    num_hsegs = hc->table.nb_entries;
    MX_STBAR();
    tbl = num_hsegs > 0 ? mx_dma_table_for_seg(hc,0) : &hc->table;
    MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].index), 0);
    MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].addr.high), htonl(tbl->pin.dma.high));
    MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].addr.low), htonl(tbl->pin.dma.low));
    tbl = num_hsegs > MX_ADDRS_PER_VPAGE ? mx_dma_table_for_seg(hc,MX_ADDRS_PER_VPAGE) : &hc->table;
    MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].next.high), htonl(tbl->pin.dma.high));
    MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].next.low), htonl(tbl->pin.dma.low));
    status = 0;
    /* "abort" only drops the mutex; status 0 makes this a success return */
    goto abort;
  }

  mx_assert(MX_PIO_READ(&(es->mcp_rdma_windows[handle].addr.low)) == MX_DMA_INVALID_ENTRY);

  /* attempt to allocate a container for them, ensure we have at least
     one descriptor to avoid special cases */
  hc =  mx_new_dma_win(is, num_hsegs);
  if (!hc) {
    MX_WARN(("cannot mx_new_dma_win(%d)\n", num_hsegs));
    status = ENOMEM;
    goto abort;
  }

  /* walk the region, pinning pages as we go and storing the DMA address
     in the mcp chunks */

  if (length == 0) {
    /* make first dma desc points to the directory page, the mcp might
       use it with a zero-length dma */
    mx_assert(hc->table.log_level == 0);
    hc->table.desc[0].low = MX_DMA_INVALID_ENTRY;
  }

  /* remembered in the window so that mx_deregister() unpins with the
     same flags */
  flags = MX_PIN_STREAMING | MX_AS_TO_PIN_FLAGS(memory_context);
  hc->flags = flags;

  cur_useg = &usegs[0];
  cur_va = cur_useg->vaddr;
  /* one leaf table (up to MX_ADDRS_PER_VPAGE vpages) per iteration */
  for (hseg = 0; hseg < num_hsegs; hseg += MX_ADDRS_PER_VPAGE) {
    int i;
    npages = MIN(num_hsegs - hseg, MX_ADDRS_PER_VPAGE);
    tbl = mx_dma_table_for_seg(hc, hseg);
    mx_assert(npages <= tbl->nb_entries);

    for(i = 0; i < npages; i++, cur_va += MX_VPAGE_SIZE)
      tbl->u.pins[i].va = cur_va;

    status = mx_pin_vpages(is, tbl->u.pins, tbl->desc, npages, 
			   flags, memory_context);
    if (status) {
      MX_WARN(("cannot pin pages\n"));
      /* mx_pin_vpages() already released what it had pinned in this
	 leaf; zero va so the cleanup loop below skips it */
      for(i = 0; i < npages; i++)
	tbl->u.pins[i].va = 0;
      goto abort_with_pins;
    }
  }

  /* XXX add some code here to tell the mcp where to start */

  /* NOTE(review): es->sync has been held since the first check of this
     slot, so this branch looks unreachable unless a callee releases the
     mutex -- confirm */
  if (es->host_rdma_windows[handle].win) {
    status = EBUSY;
    MX_WARN(("mx_register: lost race to register handle %d\n", handle));
    goto abort_with_pins;
  }
  es->host_rdma_windows[handle].win = hc;
  MX_STBAR();
  /* publish length, start index and the first two leaf descriptor pages */
  MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].length), htonl(length));
  MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].index), 0);
  tbl = num_hsegs > 0 ? mx_dma_table_for_seg(hc,0) : &hc->table;
  MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].addr.high), htonl(tbl->pin.dma.high));
  MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].addr.low), htonl(tbl->pin.dma.low));
  tbl = num_hsegs > MX_ADDRS_PER_VPAGE ? mx_dma_table_for_seg(hc,MX_ADDRS_PER_VPAGE) : &hc->table;
  MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].next.high), htonl(tbl->pin.dma.high));
  MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].next.low), htonl(tbl->pin.dma.low));

  mx_mutex_exit(&es->sync);

  return 0;

 abort_with_pins:
  /* unpin every leaf whose first pin still carries an address; leaves
     never reached, and the one that failed, have va == 0 */
  for (hseg = 0; hseg < num_hsegs; hseg += MX_ADDRS_PER_VPAGE) {
    tbl = mx_dma_table_for_seg(hc,hseg);
    npages = MIN(num_hsegs - hseg, MX_ADDRS_PER_VPAGE);
    mx_assert(npages <= tbl->nb_entries);
    if (tbl->u.pins[0].va)
      mx_unpin_vpages(is, tbl->u.pins, npages, flags);
  }
  mx_free_dma_win(is, hc);
  es->host_rdma_windows[handle].win = NULL;

 abort:
  mx_mutex_exit(&es->sync);
  return status;
}

/*
 * Tear down RDMA window `handle' of endpoint `es': detach it from the
 * endpoint and mark it invalid on the NIC while holding es->sync, then
 * unpin all of its pages and free the window.
 *
 * Returns 0 on success, EINVAL when no window is registered under
 * this handle.
 */
static int
mx_deregister(mx_endpt_state_t *es, uint32_t handle)
{
  int hseg, npages;
  mx_instance_state_t *is = es->is;
  mx_host_dma_win_t *hc;
  int flags;

  mx_mutex_enter(&es->sync);
  if (es->host_rdma_windows[handle].win == 0) {
    mx_mutex_exit(&es->sync);
    return EINVAL;
  }
  hc = es->host_rdma_windows[handle].win;
  es->host_rdma_windows[handle].win = 0;
  mx_assert(MX_PIO_READ(&(es->mcp_rdma_windows[handle].addr.low)) != MX_DMA_INVALID_ENTRY);
  /* FIXME: we trust the lib that the MCP has finished using this
   window, and mark it done free ourselves eventually the MCP should
   mark it unused when it's done and we should check that it is before
   freeing anything */
  /* mcp_rdma_windows lives in NIC SRAM: go through the PIO accessor
     like every other access to it in this file */
  MX_PIO_WRITE(&(es->mcp_rdma_windows[handle].addr.low), MX_DMA_INVALID_ENTRY);
  mx_mutex_exit(&es->sync);

  /* same pin flags as were used when the window was registered */
  flags = hc->flags;

  for (hseg = 0; hseg < hc->table.nb_entries; hseg += MX_ADDRS_PER_VPAGE) {
    struct mx_dma_table *tbl = mx_dma_table_for_seg(hc, hseg);
    npages = MIN(hc->table.nb_entries - hseg, MX_ADDRS_PER_VPAGE);
    mx_assert(npages <= tbl->nb_entries);
    mx_unpin_vpages(is, tbl->u.pins, npages, flags);
  }

  mx_free_dma_win(es->is, hc);
  return 0;
}


/*
 * Walk every page pin held by the RDMA windows of a locked endpoint
 * and call testfp(pin, arg) on each one.  The first pin for which
 * testfp returns non-zero is returned; NULL when there is none or
 * when the endpoint has no window array.  This is needed for Solaris
 * to be able to release pinned memory when exiting uncleanly.
 */

mx_page_pin_t *
mx_find_pin(mx_endpt_state_t *es,  int (*testfp)(mx_page_pin_t *pin, void *arg),
	    void *arg)
{
  int handle;

  for (handle = 0; handle < mx_max_rdma_windows; handle++) {
    mx_host_dma_win_t *win;
    int base;

    if (es->host_rdma_windows == NULL) {
      return NULL;
    }

    win = es->host_rdma_windows[handle].win;
    if (win == 0) {
      continue;
    }

    /* visit the window one leaf table at a time */
    for (base = 0; base < win->table.nb_entries; base += MX_ADDRS_PER_VPAGE) {
      struct mx_dma_table *tbl = mx_dma_table_for_seg(win, base);
      int npages = MIN(win->table.nb_entries - base, MX_ADDRS_PER_VPAGE);
      int slot;

      mx_assert(npages <= tbl->nb_entries);
      for (slot = 0; slot < npages; slot++) {
	mx_page_pin_t *candidate = &tbl->u.pins[slot];

	if ((*testfp)(candidate, arg)) {
	  return candidate;
	}
      }
    }
  }
  return NULL;
}

#if !MX_DISABLE_COMMON_COPYBLOCK
/*
 * Release a copyblock set up by mx_common_alloc_copyblock().  Every
 * page whose pin carries a non-zero address is unpinned and
 * unreserved, then the pin array and the backing allocation are
 * freed and the whole descriptor is cleared.
 */
void
mx_common_free_copyblock(mx_instance_state_t *is, mx_copyblock_t *cb)
{
  uint32_t idx;

  if (cb->pins != NULL) {
    for (idx = 0; idx < (cb->size / PAGE_SIZE); idx++) {
      mx_page_pin_t *pin = &cb->pins[idx];

      /* a zero address marks a page that was never pinned */
      if (pin->va == 0) {
	continue;
      }
      mx_unpin_page(is, pin, MX_PIN_KERNEL | MX_PIN_CONSISTENT);
      mx_unreserve_page((void *)(uintptr_t)pin->va);
    }
    mx_kfree(cb->pins);
  }
  mx_kfree(cb->alloc_addr);

  bzero(cb, sizeof(*cb));
}

/*
 * Allocate and pin the kernel memory of copyblock `cb', whose size
 * field must already hold a page-aligned byte count.
 *
 * One extra page is allocated so that cb->addr can be page aligned;
 * one pin per host page is then recorded in cb->pins and each page is
 * reserved.
 *
 * Returns 0 on success, ENOMEM when an allocation fails, or the
 * status of mx_pin_page().  On failure the copyblock is released with
 * mx_common_free_copyblock(), which also clears *cb.
 */
int
mx_common_alloc_copyblock(mx_instance_state_t *is, mx_copyblock_t *cb)
{
  uint32_t len = cb->size;
  uint64_t va, end_va;
  int status;
  int i;

  /* make sure that the length is page aligned */
  mx_always_assert((len & (PAGE_SIZE - 1)) == 0);

  cb->alloc_addr = mx_kmalloc(len + PAGE_SIZE, MX_MZERO|MX_WAITOK);
  if (cb->alloc_addr == NULL) {
    MX_WARN(("copyblock allocation failed due to lack of memory\n"));
    return ENOMEM;
  }
  cb->addr =  (void *)(uintptr_t)MX_PAGE_ALIGN((uintptr_t)cb->alloc_addr);

  cb->pins = mx_kmalloc(sizeof(cb->pins[0]) * (len / PAGE_SIZE), MX_MZERO|MX_WAITOK);
  if (cb->pins == NULL) {
    MX_WARN(("copyblock pin info allocation failed due to lack of memory\n"));
    mx_common_free_copyblock(is, cb);
    return ENOMEM;
  }

  /* pin and reserve the block one host page at a time */
  va = (uint64_t)(size_t)cb->addr;
  end_va = (uint64_t)(size_t)((char *)cb->addr + len);
  for (i = 0; va < end_va; i++, va += PAGE_SIZE) {
    cb->pins[i].va = va;
    status = mx_pin_page(is, &cb->pins[i],
			 MX_PIN_KERNEL | MX_PIN_CONSISTENT, 0);
    if (status) {
      MX_WARN(("Failed to pin copyblock, status = %d\n", status));
      /* keep the cleanup from unpinning the page that failed */
      cb->pins[i].va = 0;
      mx_common_free_copyblock(is, cb);
      return status;
    }
    mx_reserve_page((void *)(uintptr_t)va);
  }

  return 0;
}

#endif

/* Copyblock allocation/release for an endpoint: the mx_common_* versions
   are used when MX_KERNEL_LIB is non-zero and the endpoint is a kernel
   endpoint, the mx_alloc_copyblock()/mx_free_copyblock() versions
   otherwise.  Both macros evaluate `es' more than once.
   NOTE(review): the mx_common_* functions are only compiled in this file
   when MX_DISABLE_COMMON_COPYBLOCK is 0; presumably they are supplied
   elsewhere when it is not -- confirm. */
#define mx_endpt_alloc_copyblock(es,cb) (MX_KERNEL_LIB && (es)->is_kernel ? \
  mx_common_alloc_copyblock((es)->is,cb) : mx_alloc_copyblock((es)->is,cb))
#define mx_endpt_free_copyblock(es,cb) (MX_KERNEL_LIB && (es)->is_kernel ? \
  mx_common_free_copyblock((es)->is,cb) : mx_free_copyblock((es)->is,cb))

/*
 * Set up the copyblocks of endpoint `es'.
 *
 * The send, receive and event queues are sized from the MCP
 * parameters MX_MCP_SENDQ_VPAGE_CNT, MX_MCP_RECVQ_VPAGE_CNT and
 * MX_MCP_EVENTQ_VPAGE_CNT and allocated in host memory.  The user
 * mmapped SRAM "copyblock" is not allocated: its size comes from
 * MX_MCP_UMMAP_SIZE and its offset from the MCP, and it simply points
 * into the already mapped NIC SRAM.
 *
 * Returns 0 on success.  On failure the queues allocated so far are
 * released and the failing status is returned; EINVAL is returned
 * when the user mmapped window reported by the MCP does not fit
 * inside the SRAM.
 */
static int
mx_alloc_copyblocks(mx_endpt_state_t *es)
{
  int status;
  uint32_t vpages, offset;
  mx_instance_state_t *is;

  is = es->is;

  status = mx_mcpi.get_param(is->id, is->lanai.sram, 
			     "MX_MCP_SENDQ_VPAGE_CNT", &vpages);
  if (status) {
    MX_WARN (("Can't determine sendq size \n"));
    goto abort_with_nothing;
  }
  es->sendq.size = (uint32_t)MX_VPTOA(vpages);
  status = mx_endpt_alloc_copyblock(es, &es->sendq);
  if (status) {
    goto abort_with_nothing;
  }

  status = mx_mcpi.get_param(is->id, is->lanai.sram, 
			     "MX_MCP_RECVQ_VPAGE_CNT", &vpages);
  if (status) {
    MX_WARN (("Can't determine recvq size \n"));
    goto abort_with_sendq;
  }
  es->recvq.size = (uint32_t)MX_VPTOA(vpages);
  status = mx_endpt_alloc_copyblock(es, &es->recvq);
  if (status) {
    goto abort_with_sendq;
  }

  status = mx_mcpi.get_param(is->id, is->lanai.sram, 
			     "MX_MCP_EVENTQ_VPAGE_CNT", &vpages);
  if (status) {
    MX_WARN (("Can't determine eventq size \n"));
    goto abort_with_recvq;
  }
  es->eventq.size = (uint32_t)MX_VPTOA(vpages);
  status = mx_endpt_alloc_copyblock(es, &es->eventq);
  if (status) {
    goto abort_with_recvq;
  }

  /* this "copyblock" is really NIC SRAM; don't allocate anything.
     Just figure out where it lives.  We have the entire nic mapped
     into SRAM, so there's nothing to undo if anything fails. 
  */
  status = mx_mcpi.get_param(is->id, is->lanai.sram, 
			     "MX_MCP_UMMAP_SIZE", 
			     &es->user_mmapped_sram.size);
  if (status) {
    MX_WARN (("Can't determine user mmapped sram size \n"));
    goto abort_with_eventq;
  }

  status = mx_lanai_command(is, MX_MCP_CMD_GET_USER_MMAP_OFFSET,
                            es->endpt, 0, 0, &offset, &es->cmd_sync);
  if (status) {
    MX_WARN (("Can't determine ucmdq offset \n"));
    goto abort_with_eventq;
  }

  /* Refuse a window that reaches past the end of the SRAM instead of
     merely warning about it: this region is what gets mapped into the
     user process.  The sum is formed in 64 bits so it cannot wrap. */
  if ((uint64_t)offset + es->user_mmapped_sram.size > (uint64_t)is->sram_size) {
    MX_WARN (("User mapped sram has bad size (0x%x) or location (0x%x)\n",
	      es->user_mmapped_sram.size, offset));
    status = EINVAL;
    goto abort_with_eventq;
  }
  es->user_mmapped_sram.addr = 
    ((char *)(is->lanai.sram + (unsigned long)offset));

  /* per-endpoint request window: a fixed 64KB stride above the 8MB mark;
     its size is only filled in for Z boards */
  es->user_mmapped_zreq.addr = (char*)is->lanai.sram + (1<<23) + (es->endpt << 16);
  if (is->board_type == MX_BOARD_TYPE_Z) {
    es->user_mmapped_zreq.size = MX_MCP_UMMAP_SIZE;
  }
  return (0);

 abort_with_eventq:
  mx_endpt_free_copyblock(es, &es->eventq);
  
 abort_with_recvq:
  mx_endpt_free_copyblock(es, &es->recvq);
  
 abort_with_sendq:
  mx_endpt_free_copyblock(es, &es->sendq);

  /* make sure mx_common_close won't try a second time */
  es->eventq.pins = es->recvq.pins = es->sendq.pins = 0;

 abort_with_nothing:
  return (status);
}


/*
 * Tell the MCP where copyblock `cb' of endpoint `es' lives in host
 * memory.  `cmd' is the lanai command that returns the SRAM offset of
 * the DMA address table for this copyblock; one 64-bit DMA address,
 * in network byte order, is then written there for every vpage of the
 * copyblock.
 *
 * Returns 0 on success or the status of the lanai command.
 *
 * NOTE(review): the offset returned by the MCP is used without being
 * checked against the SRAM size -- confirm that it can be trusted.
 */
static int
mx_dma_map_copyblock(mx_endpt_state_t *es, mx_copyblock_t *cb, uint32_t cmd)
{
  mx_instance_state_t *is;
  mcp_dma_addr_t *dma;
  uint32_t offset, host_offset;
  int status;
  mx_uaddr_t mcp_index, host_index;
  unsigned long addr;

  is = es->is;
  /* first get the offset into NIC sram for this copyblock's DMA addresses */
  status = mx_lanai_command(is, cmd, es->endpt, 0, 0, &offset, &es->cmd_sync);
  if (status) {
    MX_WARN (("could not find offset for copyblock 0x%x\n", cmd));
    return (status);
  }
  dma = (mcp_dma_addr_t *)((char *)(is->lanai.sram + (unsigned long)offset));

  /* Walk the copyblock, storing the DMA address for each virtual page
     down on the nic.  Note that on hosts with large page sizes, two
     or more virtual pages may map to offsets within the same host
     page. */
     
  /* `addr' is a byte offset from the start of the copyblock, not a pointer */
  for (addr = 0; addr < cb->size; addr += MX_VPAGE_SIZE) {
    mcp_index = MX_ATOVP(addr);
    host_index = MX_ATOP(addr);

    /* compute the offset into the (possibly larger) host page */
    host_offset = (uint32_t)(addr - MX_TRUNC_PAGE(addr));
    
#if 1   /* assume 64 bit-writes are not causing padding fifo overflow */
    /* high and low halves are merged and written with a single 64-bit PIO */
    MX_PIO_WRITE((uint64_t*)&dma[mcp_index], 
		 mx_hton_u64(((uint64_t)cb->pins[host_index].dma.high << 32) +
			     cb->pins[host_index].dma.low + host_offset));
#else
    MX_PIO_WRITE(&(dma[mcp_index].high), htonl(cb->pins[host_index].dma.high));
    MX_PIO_WRITE(&(dma[mcp_index].low), htonl(cb->pins[host_index].dma.low + host_offset));
#endif
    /* store barrier after each entry */
    MX_STBAR();
    
  }
  return (0);
}

/*
 * Tell the MCP the DMA addresses of the send, receive and event
 * copyblocks of endpoint `es`, in that order.
 *
 * Returns 0 when all three were mapped.  On failure the status of the
 * first copyblock that could not be mapped is returned unchanged and
 * the remaining copyblocks are not attempted.  (The statuses must not
 * be OR-ed together: combining two different error codes bitwise
 * yields an unrelated error code.)
 */
static int
mx_dma_map_copyblocks(mx_endpt_state_t *es)
{
  int status;

  status = mx_dma_map_copyblock(es, &es->sendq,
                                MX_MCP_CMD_GET_HOST_SENDQ_OFFSET);
  if (status != 0)
    return (status);

  status = mx_dma_map_copyblock(es, &es->recvq,
                                MX_MCP_CMD_GET_HOST_RECVQ_OFFSET);
  if (status != 0)
    return (status);

  return (mx_dma_map_copyblock(es, &es->eventq,
                               MX_MCP_CMD_GET_HOST_EVENTQ_OFFSET));
}


/*
 * Bring endpoint `es` up on the MCP.
 *
 * The steps are, in order: open the endpoint, fetch the SRAM offset of
 * its RDMA window table, publish the copyblock DMA addresses, program
 * the session id, and enable the endpoint.  The first step that fails
 * logs a warning and its status is returned; 0 means all steps passed.
 */
int
mx_open_mcp_endpoint(mx_endpt_state_t *es)
{
  mx_instance_state_t *is = es->is;
  uint32_t ignored;
  uint32_t win_offset;
  int rc;

  /* step 1: open on the mcp */
  rc = mx_lanai_command(is, MX_MCP_CMD_OPEN_ENDPOINT, es->endpt, 0, 0,
                        &ignored, &es->cmd_sync);
  if (rc != 0) {
    MX_WARN(("mx%d: endpt %d, mcp open failed\n", is->id, es->endpt));
    goto out;
  }

  /* step 2: locate the rdma window table inside NIC sram */
  rc = mx_lanai_command(is, MX_MCP_CMD_GET_RDMA_WINDOWS_OFFSET, es->endpt,
                        0, 0, &win_offset, &es->cmd_sync);
  if (rc != 0) {
    MX_WARN(("mx%d: endpt %d, mcp get rdma window offset failed\n", is->id, es->endpt));
    goto out;
  }
  es->mcp_rdma_windows = (mcp_rdma_win_t *)((char *)is->lanai.sram  + win_offset);

  /* step 3: give the MCP the copyblock DMA addresses */
  rc = mx_dma_map_copyblocks(es);
  if (rc != 0) {
    MX_WARN(("mx%d: endpt %d, mapping copyblock failed\n", is->id, es->endpt));
    goto out;
  }

  /* step 4: program the session id */
  rc = mx_lanai_command(is, MX_MCP_CMD_SET_ENDPOINT_SESSION, es->endpt,
                        es->session_id, 0, &ignored, &es->cmd_sync);
  if (rc != 0) {
    MX_WARN(("mx%d: endpt %d, set endpt session_id failed\n", is->id, es->endpt));
    goto out;
  }

  /* step 5: enable */
  rc = mx_lanai_command(is, MX_MCP_CMD_ENABLE_ENDPOINT, es->endpt, 0, 0,
                        &ignored, &es->cmd_sync);
  if (rc != 0)
    MX_WARN(("mx%d: endpt %d, endpt enable failed\n", is->id, es->endpt));

 out:
  return rc;
}

/*
 * Open endpoint `endpoint` (or the raw interface when `raw` is set) on
 * board `unit`, using the caller-provided state `es`.
 *
 * Returns 0 on success.  Error returns visible here:
 *   ENODEV  endpoint out of range (non-raw) or no such board
 *   ENXIO   the board is marked dead
 *   EBUSY   the endpoint slot / the raw interface is already in use
 *   EPERM   raw open without privilege
 *   ENOMEM  the rdma window array could not be allocated
 *   or whatever mx_kraw_init(), mx_alloc_copyblocks() or
 *   mx_open_mcp_endpoint() returned.
 *
 * Failures detected while is->sync is held drop the lock and the
 * instance reference directly; later failures unwind through
 * mx_common_close().
 */
int
mx_common_open(int32_t unit, int32_t endpoint, mx_endpt_state_t *es, int raw)
{
  mx_instance_state_t *is;
  int rc = 0;
  int privileged = es->privileged | mx_security_disabled;

  /* only regular endpoints are range checked */
  if (!raw && (endpoint < 0 || endpoint >= mx_max_endpoints))
    return (ENODEV);

  is = mx_get_instance(unit);
  if (is == 0)
    return (ENODEV);

  mx_mutex_enter(&is->sync);

  if (mx_is_dead(is)) {
    rc = ENXIO;
    goto abort_locked;
  }

  if (raw) {
    /* the raw message api allows a single opener ... */
    if (is->raw.use_count != 0) {
      rc = EBUSY;
      goto abort_locked;
    }
    /* ... which has to be privileged */
    if (!privileged) {
      rc = EPERM;
      goto abort_locked;
    }
    es->is = is;
    es->flags = MX_ES_RAW;
    is->raw.use_count++;

    /* mx_kraw_init() is called with is->sync released */
    mx_mutex_exit(&is->sync);
    rc = mx_kraw_init(is);
    mx_mutex_enter(&is->sync);

    is->raw.es = es;
    if (rc) {
      MX_WARN(("mx%d: failed to setup raw interface. Status %d\n",
               is->id, rc));
      /* undo everything done for the raw open */
      is->raw.es = 0;
      es->is = 0;
      es->flags = MX_ES_INVALID;
      is->raw.use_count--;
      goto abort_locked;
    }
  } else {
    if (is->es[endpoint] != 0) {
      rc = EBUSY;
      goto abort_locked;
    }

    /* claim the slot and fill in the endpoint identity */
    is->es[endpoint] = es;
    es->is = is;
    es->endpt = endpoint;
    /* random session id with bit 30 forced on and bit 31 forced off */
    es->session_id = mx_rand() | 0x40000000U;
    es->session_id &= ~0x80000000U;
    es->flags = MX_ES_API;
  }

  es->ref_count = 0;
  mx_sync_init(&es->sync, is, endpoint, "es->sync");
  mx_sync_init(&es->cmd_sync, is, endpoint, "es->cmd_sync");
  mx_sync_init(&es->wait_sync, is, endpoint, "es->wait_sync");
  mx_sync_init(&es->app_wait_sync, is, endpoint, "es->app_wait_sync");
  mx_sync_init(&es->close_sync, is, endpoint, "es->close_sync");

  mx_mutex_exit(&is->sync);

  /* a raw open needs none of the per-endpoint resources below */
  if (raw)
    return 0;

  /* host side bookkeeping for the rdma windows */
  es->host_rdma_windows = mx_kmalloc(sizeof (es->host_rdma_windows[0])
                                     * mx_max_rdma_windows, MX_MZERO);
  if (es->host_rdma_windows == NULL) {
    rc = ENOMEM;
    goto abort_with_close;
  }

  /* send/recv/event copyblocks */
  rc = mx_alloc_copyblocks(es);
  if (rc) {
    MX_WARN(("mx%d: endpt %d, copyblock alloc failed, status = %d\n",
             is->id, es->endpt, rc));
    goto abort_with_close;
  }

  rc = mx_open_mcp_endpoint(es);
  if (rc != 0) {
    MX_WARN(("mx%d: endpt %d, mx_open_mcp_endpoint failed\n", is->id, es->endpt));
    goto abort_with_close;
  }

  /* snapshot the board's parity error counters at open time */
  es->parity_errors_detected = is->parity_errors_detected;
  es->parity_errors_corrected = is->parity_errors_corrected;

  return 0;

 abort_with_close:
  mx_common_close(es);
  return (rc);

 abort_locked:
  mx_mutex_exit(&is->sync);
  mx_release_instance(is);
  return (rc);
}

/*
 * Look up endpoint number `endpt` on board `is` and take a reference
 * on it.
 *
 * Returns the endpoint state with ref_count incremented, or NULL when
 * the number is out of range, the slot is empty, or the endpoint has
 * MX_ES_CLOSING set.  A successful call is undone with
 * mx_put_endpoint().
 */
mx_endpt_state_t *
mx_get_endpoint(mx_instance_state_t *is, int endpt)
{
  mx_endpt_state_t *candidate;
  mx_endpt_state_t *result = NULL;

  /* the unsigned cast also rejects negative numbers */
  if ((uint32_t) endpt >= mx_max_endpoints)
    return NULL;

  mx_mutex_enter(&is->sync);
  candidate = is->es[endpt];
  if (candidate != NULL) {
    mx_mutex_enter(&candidate->sync);
    if (!(candidate->flags & MX_ES_CLOSING)) {
      candidate->ref_count++;
      result = candidate;
    }
    mx_mutex_exit(&candidate->sync);
  }
  mx_mutex_exit(&is->sync);

  return result;
}

/*
 * Drop one reference on `es`.  If the endpoint has MX_ES_CLOSING set
 * and this is the last outstanding reference, es->sync is woken before
 * the count is decremented (mx_common_close() sleeps on es->sync while
 * references remain).
 */
void
mx_put_endpoint(mx_endpt_state_t *es)
{
  int last_ref_while_closing;

  mx_mutex_enter(&es->sync);
  last_ref_while_closing = (es->flags & MX_ES_CLOSING) && (es->ref_count == 1);
  if (last_ref_while_closing)
    mx_wake(&es->sync);
  es->ref_count -= 1;
  mx_mutex_exit(&es->sync);
}

/*
 * Tear down endpoint `es` (regular or raw) and release the instance
 * reference held by it.
 *
 * Sequence: mark the endpoint MX_ES_CLOSING and wait until ref_count
 * reaches zero; for a regular endpoint ask the MCP to close it and
 * wait for the close notification; unhook the endpoint from the
 * instance; free the rdma windows and copyblocks (or destroy the raw
 * interface); destroy the sync objects; release the instance.
 */
void
mx_common_close(mx_endpt_state_t *es)
{
  mx_instance_state_t *is = es->is;
  const int raw = es->flags & MX_ES_RAW;
  uint32_t ignored;
  int rc = 0;
  int rounds;
  int w;

  /* Wait for every outstanding reference to be dropped.  The flag is
     (re)asserted on each pass; the message is printed only once. */
  for (rounds = 0; ; rounds++) {
    mx_mutex_enter(&es->sync);
    es->flags |= MX_ES_CLOSING;
    if (es->ref_count == 0) {
      mx_mutex_exit(&es->sync);
      break;
    }
    if (rounds == 0)
      MX_INFO(("mx%d: closing endpoint %d with %d references, sleeping\n",
               es->is->id, es->endpt, es->ref_count));
    mx_mutex_exit(&es->sync);
    mx_sleep(&es->sync, MX_CLOSE_WAIT, MX_SLEEP_NOINTR);
  }

  if (!raw) {
    rc = mx_lanai_command(is, MX_MCP_CMD_CLOSE_ENDPOINT, es->endpt, 0, 0,
                          &ignored, &es->cmd_sync);
    if (rc != 0) {
      MX_WARN(("mx%d: Failed to close endpoint %d on mcp\n",
               is->id, es->endpt));
    } else {
      /* Give the mcp up to MX_CLOSE_WAIT to report the endpoint closed
         before resources it may still depend on are freed. */
      rc = mx_sleep(&es->close_sync, MX_CLOSE_WAIT, MX_SLEEP_NOINTR);
      if (rc) {
        MX_WARN(("mx%d: Timed out waiting for MCP to close endpoint %d\n",
                 is->id, es->endpt));
        mx_mark_board_dead(is, MX_DEAD_ENDPOINT_CLOSE_TIMEOUT, es->endpt);
      }
    }
  }

  /* detach the endpoint from the instance */
  mx_mutex_enter(&is->sync);
  if (raw)
    is->raw.es = NULL;
  else
    is->es[es->endpt] = 0;
  mx_mutex_exit(&is->sync);

  if (raw) {
    mx_kraw_destroy(is);
    mx_mutex_enter(&is->sync);
    is->raw.use_count--;
    mx_mutex_exit(&is->sync);
  } else {
    if (es->host_rdma_windows) {
      /* deregister whatever windows are still registered */
      for (w = 0; w < mx_max_rdma_windows; w++) {
        if (es->host_rdma_windows[w].win)
          mx_deregister(es, w);
      }
      mx_kfree(es->host_rdma_windows);
      es->host_rdma_windows = NULL;
    }
    es->mcp_rdma_windows = 0;

    /* a copyblock without pins has nothing to free */
    if (es->sendq.pins)
      mx_endpt_free_copyblock(es, &es->sendq);
    if (es->recvq.pins)
      mx_endpt_free_copyblock(es, &es->recvq);
    if (es->eventq.pins)
      mx_endpt_free_copyblock(es, &es->eventq);
  }

  mx_sync_destroy(&es->close_sync);
  mx_sync_destroy(&es->wait_sync);
  mx_sync_destroy(&es->app_wait_sync);
  mx_sync_destroy(&es->cmd_sync);
  mx_sync_destroy(&es->sync);
  mx_release_instance(is);
}

/*
 * Debug helper: print the current interrupt queue position (queue,
 * slot, expected sequence number) followed by the contents of both
 * interrupt queues.  Slots 0-7 are always printed; later slots are
 * printed only when their type is non-zero.
 */
static void
mx_dump_interrupt_queues(mx_instance_state_t *is)
{
  int q, s, type;
  mcp_slot_t *entry;

  MX_PRINT(("--------- Dumping interrupt queue state ----- \n"));
  MX_PRINT(("currently expecting interrupts on queue %d, slot=%d, seq=%d\n",
            is->intr.intrq, is->intr.slot, is->intr.seqnum));
  /*  MX_PRINT(("REQ_ACK_0 = %x\n", mx_read_lanai_special_reg_u32(is, lx.ISR) & MX_LX_REQ_ACK_0));*/
  MX_PRINT((" q  slot  status \n"));
  MX_PRINT(("--- ---- -------- \n"));

  for (q = 0; q < 2; q++) {
    for (s = 0; s < is->intr.maxslots; s++) {
      entry = &is->intr.q[q][s];
      type = entry->type;
      if (type != 0 || s < 8) {
        MX_PRINT(("[%d]:[%d]: type=%d   flag=%d   index=%d\n"
                  "\t seq=0x%x \tdata0=0x%x data1=0x%x\n",
                  q, s, type, entry->flag, ntohs(entry->index),
                  ntohl(entry->seqnum), ntohl(entry->data0), ntohl(entry->data1)));
      }
    }
  }
}

/*
 * Service the interrupt queue of board `is`.
 *
 * Events are consumed from the current queue starting at the saved
 * slot.  Each event must carry the next expected sequence number and a
 * known type; it is dispatched, its type is cleared, and processing
 * continues until the slot whose `flag` is non-zero (end of queue) has
 * been handled, at which point the driver switches to the other queue.
 *
 * Returns 1 when the end-of-queue slot was consumed.  Returns 0 when
 * the queues are not set up, the board reports no interrupt, the queue
 * ran dry before the interrupt was claimed, or an inconsistency was
 * found (in which case the queues are dumped and the board interrupt
 * is disabled).
 *
 * Values taken from event slots come from the NIC; those used as array
 * indices for the endpoint and rdma window tables are range checked
 * before use.
 */
int
mx_common_interrupt(mx_instance_state_t *is)
{
  uint8_t type;
  int intrq, maxslots, idx, claimed, call_eth_send_done;
  uint32_t eth_send_done_slot;
  unsigned int slot;

  call_eth_send_done = 0;
  eth_send_done_slot = 0; /* -Wunitialized */
  intrq = is->intr.intrq;
  maxslots = is->intr.maxslots;
#if MX_DEBUG
  is->intr.cnt++;
#endif

  /* don't try to handle unless interrupt queues have been setup */
  if (maxslots == 0)
    return 0;

  switch (is->board_ops.might_be_interrupting(is)) {
  case MX_INTR_ACTIVE:
    /* We know we can immediately claim the interrupt */
    is->board_ops.claim_interrupt(is);
    MX_STBAR();
    claimed = 1;
    break;
  case MX_INTR_NONE:
    return 0;
  default:
    /* Check to see if we have the last event in the queue ready.  If
       so, ack it as early as possible.  This allows more time to get
       the interrupt line de-asserted prior to the EOI and reduces the
       chance of seeing a spurious irq caused by the interrupt line
       remaining high after EOI */
    /* an index of 0 wraps to a huge unsigned value and fails the
       range test below */
    slot = ntohs(is->intr.q[intrq][0].index) - 1;
    if (slot < maxslots &&
        is->intr.q[intrq][slot].type  != 0 &&
        is->intr.q[intrq][slot].flag != 0) {
      is->board_ops.claim_interrupt(is);
      MX_STBAR();
      claimed = 1;
    } else {
      claimed = 0;
    }
  }

  /* walk each slot in the current queue, processing events until
     we reach an event with a zero type */
  for (slot = is->intr.slot; slot < maxslots; slot++) {
    type = is->intr.q[intrq][slot].type;
    MX_READBAR();
    if (claimed && !type) {
      MX_WARN(("Board %d: slot %d: null type found while interrupt is claimed\n",
               is->id, slot));
      mx_dump_interrupt_queues(is);
      is->board_ops.disable_interrupt(is);
      return 0;
    } else if (!claimed) {
      /* look if there is something in the queue */
      if (type == 0) {
        /* save the current slot for the next time we (re-)enter this
           routine */
        is->intr.slot = slot;
#if MX_DEBUG
        is->intr.spurious++;
#endif
        /* deliver a deferred ethernet send completion, if any */
        if (call_eth_send_done)
          mx_ether_tx_done(is, ntohl(eth_send_done_slot));
        return 0;
      }
    }

    if (ntohl(is->intr.q[intrq][slot].seqnum) != is->intr.seqnum) {
      MX_WARN(("bad interrupt sequence number (slot %d, got %d, expected%d), disabling interrupts\n",
               slot, (unsigned int)ntohl(is->intr.q[intrq][slot].seqnum), is->intr.seqnum));
      mx_dump_interrupt_queues(is);
      is->board_ops.disable_interrupt(is);
      return 0;

    }
    is->intr.seqnum += 1;

    switch (type) {
    case MX_MCP_INTR_INIT_DONE:
      mx_wake(&is->init_sync);
      MX_DEBUG_PRINT(MX_DEBUG_INTR, ("initialization done\n"));
      break;

    case MX_MCP_INTR_CMD_ACK:
      /* fallthrough */
    case MX_MCP_INTR_CMD_NACK:
      /* last argument: 0 for an ACK, -1 for a NACK */
      mx_lanai_command_complete(is, ntohl(is->intr.q[intrq][slot].data0),
                                ntohl(is->intr.q[intrq][slot].data1),
                                type == MX_MCP_INTR_CMD_ACK ? 0 : -1);
      break;

    case MX_MCP_INTR_RAW_RECV:
      if (is->raw.es != NULL)
        mx_kraw_rx_intr(is, ntohl(is->intr.q[intrq][slot].data0),
                        ntohl(is->intr.q[intrq][slot].data1));
      break;

    case MX_MCP_INTR_RAW_SEND:
      is->kreqq_completed++;
      if (is->raw.es != NULL)
        mx_kraw_tx_intr(is, ntohl(is->intr.q[intrq][slot].data0),
                        ntohl(is->intr.q[intrq][slot].data1));

      break;

    case MX_MCP_INTR_MAPPER_TICK:
      mx_kraw_tick(is);
      break;

    case MX_MCP_INTR_ETHER_SEND_DONE:
      /* only remember the most recent completion; mx_ether_tx_done()
         is called once, when this routine is about to return */
      call_eth_send_done = 1;
      eth_send_done_slot = is->intr.q[intrq][slot].data0;
      break;

    case MX_MCP_INTR_ETHER_RECV_SMALL:
    case MX_MCP_INTR_ETHER_RECV_BIG:
      {
        int flags, count, length, ip_csum;
        uint32_t raw;

        /* data0: low 8 bits = count, remaining bits = flags */
        raw = ntohl(is->intr.q[intrq][slot].data0);
        count = 0xff & raw;
        flags = raw >> 8;
        /* data1: high 16 bits = ip checksum, low 16 bits = length */
        raw = ntohl(is->intr.q[intrq][slot].data1);
        ip_csum = raw >> 16;
        length = 0xffff & raw;
        if (type == MX_MCP_INTR_ETHER_RECV_SMALL)
          mx_ether_rx_done_small(is, count, length, ip_csum, flags);
        else
          mx_ether_rx_done_big(is, count, length, ip_csum, flags);
      }
      break;

    case MX_MCP_INTR_QUERY:
      {
        int index;

        is->kreqq_completed++;
        if (is->intr.q[intrq][slot].data0 == htonl(MX_MCP_QUERY_HOSTNAME)) {
          index = ntohl(is->intr.q[intrq][slot].data1);
          mx_name_peer(is, index);
        }
      }
      break;

    case MX_MCP_INTR_WAKE:
    case MX_MCP_INTR_ENDPT_ERROR:
    case MX_MCP_INTR_ENDPT_CLOSED:
      {
        uint32_t endpoint = ntohl(is->intr.q[intrq][slot].data0);
        mx_endpt_state_t *es;


        if (endpoint >= mx_max_endpoints) {
          MX_WARN(("mx%d:tried to wake invalid endpoint 0x%x\n",
                   is->id, endpoint));
          break;
        }

        if ((es = is->es[endpoint]) == 0) {
          MX_WARN(("mx%d:tried to wake closed endpoint 0x%x\n",
                   is->id, endpoint));
          break;
        }
        switch (type) {
        case MX_MCP_INTR_ENDPT_ERROR:
          MX_WARN(("mx%d:mcp detected endpoint error %d on endpoint 0x%x\n",
                   is->id, ntohl(is->intr.q[intrq][slot].data1), endpoint));
          es->endpt_error = MX_WAIT_ENDPT_ERROR;
          /* zero data1 so the wake code below takes its data1 == 0
             branch */
          is->intr.q[intrq][slot].data1 = 0;
          /* fallthrough */
        case MX_MCP_INTR_WAKE:
          if (ntohl(is->intr.q[intrq][slot].data1) == 0) {
            /* a wake interrupt which is unrelated to a wake request
               (caused by a eventq full or an incoming
               rndv packet have data1) == 0 */
            mx_atomic_add(1, &es->no_mcp_req_wakeups);
            mx_wake(&es->wait_sync);
          } else {
            if (es->app_waiting) {
              es->app_waiting = 0;
              mx_wake(&es->app_wait_sync);
            } else {
              mx_wake(&es->wait_sync);
            }
          }
          break;
        case MX_MCP_INTR_ENDPT_CLOSED:
          mx_wake(&es->close_sync);
          break;
        default:
          break;
        }

        break;
      }

    case MX_MCP_INTR_LOGGING:
      is->logging.size = ntohl(is->intr.q[intrq][slot].data0);
      mx_wake(&is->logging.sync);
      break;

    case MX_MCP_INTR_PRINT:
      idx = ntohl(is->intr.q[intrq][slot].data0);
      mx_lanai_print(is, idx);
      break;

    case MX_MCP_INTR_DMABENCH:
      is->dmabench.cycles = ntohl(is->intr.q[intrq][slot].data0);
      is->dmabench.count = ntohl(is->intr.q[intrq][slot].data1);
      mx_wake(&is->dmabench.wait_sync);
      break;

    case MX_MCP_INTR_RDMAWIN_UPDATE:
      {
        /* data0: high 16 bits = endpoint, low 16 bits = rdma window;
           data1: index of the directory page the NIC is currently on */
        uint32_t ep_rdmawin = ntohl(is->intr.q[intrq][slot].data0);
        uint32_t index = ntohl(is->intr.q[intrq][slot].data1);
        uint16_t epnum = ep_rdmawin >> 16;
        uint16_t rdmawin = ep_rdmawin & 0xffff;
        struct mx_endpt_state *ep = NULL;
        mcp_rdma_win_t *win = NULL;
        mx_host_dma_win_t *hc = NULL;

        /* Both numbers come from the NIC: never index is->es[] or the
           per-endpoint window array with an out-of-range value.  An
           invalid request falls into the "spurious" report below. */
        if (epnum < mx_max_endpoints)
          ep = is->es[epnum];
        if (ep && ep->host_rdma_windows && rdmawin < mx_max_rdma_windows) {
          win = ep->mcp_rdma_windows + rdmawin;
          hc = ep->host_rdma_windows[rdmawin].win;
        }

        if (ep && hc && (index + 1) * MX_ADDRS_PER_VPAGE < hc->table.nb_entries) {
          struct mx_dma_table *tbl;
          /* publish the DMA address of the next directory page */
          tbl = mx_dma_table_for_seg(hc, (index + 1) * MX_ADDRS_PER_VPAGE);
          win->next.high = htonl(tbl->pin.dma.high);
          MX_STBAR();
          win->next.low = htonl(tbl->pin.dma.low);
        } else {
          MX_INFO(("Spurious next_dma_dir request, (%d,%d,%d), ep=%p, "
                   "rdmawin=%p,total_descs=%d\n",
                   epnum, rdmawin, index, ep, hc, hc ? hc->table.nb_entries : -1));
        }

      }
      break;

    case MX_MCP_INTR_LINK_CHANGE:
      {
        /* NOTE(review): port is taken from the NIC and is not range
           checked before being used as a shift count */
        int port = ntohl(is->intr.q[intrq][slot].data0);
        if (is->intr.q[intrq][slot].data1 != 0)
          is->link_state |= (1 << port);
        else
          is->link_state &= ~(1 << port);
      }
      mx_ether_link_change_notify(is);
      break;

    default:
      MX_WARN(("Board %d: slot %d: unknown intr type 0x%x, disabling IRQ\n",
               is->id, slot, type));
      mx_dump_interrupt_queues(is);
      is->board_ops.disable_interrupt(is);
      return 0;
    }

    /* clear the interrupt type  */
    is->intr.q[intrq][slot].type = 0;

    /* a non-zero flag marks the last event of this queue */
    if (is->intr.q[intrq][slot].flag != 0) {
      if (!claimed) {
        is->board_ops.claim_interrupt(is);
        MX_STBAR();
      }
      /* restart at slot 0 of the other queue next time */
      is->intr.slot = 0;
      is->intr.q[intrq][slot].flag = 0;
      is->intr.intrq = ((intrq + 1) & 1);
      if (call_eth_send_done)
        mx_ether_tx_done(is, ntohl(eth_send_done_slot));
      return 1;
    }
  }
  MX_WARN(("No end-of-queue intr flag found (slot %d/%d, seqnum %d)\n"
           "\tdisabling IRQ\n",
           slot, maxslots, is->intr.seqnum));
  mx_dump_interrupt_queues(is);
  is->board_ops.disable_interrupt(is);
  return 0;
}

/*
 * Register (pin) the memory described by the mx_reg_t found at `in`
 * as rdma window `rdma_id` of endpoint `es`.
 *
 * The request is copied in with mx_copyin() and validated before use:
 * the segment count must not be negative nor exceed MX_MAX_SEGMENTS,
 * and the window id must be below mx_max_rdma_windows.  With more than
 * one segment the segment list is fetched from ps->segs.vaddr into a
 * temporary buffer; otherwise the segment embedded in the request is
 * used directly.
 *
 * Returns 0 on success, EINVAL for a negative segment count or an
 * out-of-range window id, E2BIG for too many segments, ENOMEM when the
 * temporary segment buffer cannot be allocated, or the status of
 * mx_copyin() / mx_register().
 */
static int
mx_pin(mx_endpt_state_t *es, const mx_uaddr_t in)
{
  int nsegs, status;
  uint32_t handle_id;
  mx_reg_seg_t *segs = NULL;
  mx_reg_t pin_struct, *ps;

  status = mx_copyin(in, &pin_struct, sizeof(pin_struct), es->is_kernel);
  if (status)
    return status;

  ps = &pin_struct;
  nsegs = ps->nsegs;
  handle_id = ps->rdma_id;

  /* nsegs is caller-supplied; a negative value would slip past the
     upper-bound test below and reach mx_register() */
  if (nsegs < 0) {
    MX_WARN (("mx_pin: invalid segment count\n"));
    return EINVAL;
  }

  if (nsegs > MX_MAX_SEGMENTS) {
    MX_WARN (("mx_pin: too many segments\n"));
    return E2BIG;
  }

  if (handle_id >= mx_max_rdma_windows) {
    MX_WARN (("MX__PIN: Handle %d too large\n", handle_id));
    return EINVAL;
  }

  if (nsegs > 1) {
    /* the request only carries the address of the segment list */
    segs = mx_kmalloc(nsegs * sizeof(*segs), 0);
    if (segs == NULL) {
      return ENOMEM;
    }
    status = mx_copyin((mx_uaddr_t) ps->segs.vaddr,
                       segs, nsegs*sizeof(*segs),
                       es->is_kernel);
    if (status) {
      goto abort_with_segs;
    }
    status = mx_register(es, handle_id, nsegs, segs,  ps->memory_context);
  } else {
    /* the single segment is embedded in the request itself */
    status = mx_register(es, handle_id, nsegs, &ps->segs, ps->memory_context);
  }
  /* success falls through the same cleanup code used for abort */

 abort_with_segs:
  if (segs)
    mx_kfree(segs);
  /* after a successful registration the window's mcp entry must no
     longer hold the invalid marker */
  mx_assert(status != 0 ||
            MX_PIO_READ(&(es->mcp_rdma_windows[handle_id].addr.low)) != MX_DMA_INVALID_ENTRY);
  return status;
}

/*
 * WINDOWS_BOUNDS_CHECK_COPY(len, limit)
 *
 * On MX_OS_WINNT builds: if `len` exceeds `limit`, set the enclosing
 * function's `status` to EINVAL and jump to its `abort` label.  Both
 * `status` and `abort` must therefore exist wherever the macro is used.
 * On every other platform the macro expands to nothing.
 *
 * The arguments are parenthesized so that compound expressions are
 * compared as a whole.  The expansion deliberately still ends in a
 * semicolon, as it always has, because the call sites are outside this
 * part of the file and may rely on it.
 */
#if MX_OS_WINNT
#define WINDOWS_BOUNDS_CHECK_COPY(len, limit) do       \
  {                                                    \
    if ((len) > (limit))                               \
      {                                                \
        status = EINVAL;                               \
        goto abort;                                    \
      }                                                \
  }                                                    \
  while (0);
#else
#define WINDOWS_BOUNDS_CHECK_COPY(len, limit)
#endif

int
mx_endptless_ioctl(uint32_t cmd, const mx_uaddr_t in, uint32_t privileged,
		   uint32_t is_kernel)
{
  int status = 0;
  mx_instance_state_t *is;
  mx_uaddr_t out = in;
  uint32_t output_bytes;
  MX_VAR_MAY_BE_UNUSED (out);
  MX_VAR_MAY_BE_UNUSED (output_bytes);

  privileged |= mx_security_disabled;

  switch (cmd) {

  case MX_GET_NUM_PORTS:
    {
      uint32_t unit;

      status = mx_copyin(in, &unit, sizeof(unit), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(unit);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      status = mx_copyout(&is->num_ports, out, sizeof(is->num_ports), is_kernel);
      mx_release_instance(is);
    }
    break;

  case MX_GET_LOGGING:
    {
      mx_get_logging_t t;

      status = mx_copyin(in, &t, sizeof(t), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(t.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }

      status = mx_get_logging(is, t.size, (mx_uaddr_t)t.buffer,
			      is_kernel);
      mx_release_instance(is);
    }
    break;
      
  case MX_GET_COUNTERS:
  case MX_CLEAR_COUNTERS:
  case MX_GET_IRQ_COUNTERS:
    {
      uint32_t board_num;

      status = mx_copyin(in, &board_num, sizeof(board_num), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(board_num);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }

      if (cmd == MX_CLEAR_COUNTERS) {
	mx_sync_t tmp_sync;
	if (!privileged) {
	  status = EPERM;
	  goto abort_with_is;
	}
	mx_sync_init(&tmp_sync, is, 0, "clear counters temp sync");
	status = mx_lanai_command(is, MX_MCP_CMD_RESET_COUNTERS, 0,
				  0, 0, &board_num, &tmp_sync);
	mx_sync_destroy(&tmp_sync);
      } else if(cmd == MX_GET_COUNTERS) {
      	status = mx_get_counters(is, out, is_kernel);
      } else if (cmd == MX_GET_IRQ_COUNTERS && is->intr.cnt != 0) {
	mx_irq_counters_t irq;

	irq.count = is->intr.cnt;
	irq.events = is->intr.seqnum;
	irq.spurious = is->intr.spurious;
	status = mx_copyout(&irq, out, sizeof(irq), is_kernel);
      } else {
	status = ENOTTY;
      }
      mx_release_instance(is);
    }
    break;

  case MX_CLEAR_PEER_NAMES:
    if (!privileged) {
      status = EPERM;
      goto abort_with_nothing;
    }
    mx_clear_peer_names();
    break;

  case MX_CRASHDUMP:
    {
      mx_crashdump_t dump;
      uint32_t spin_offset;
      mx_sync_t tmp_sync;
      int running, i;
      volatile uint32_t *pause_addr = NULL;
      uint32_t *registers, num_registers;

      if (!privileged) {
	status = EPERM;
	goto abort_with_nothing;
      }
      status = mx_copyin(in, &dump, sizeof(dump), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(dump.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }

      running = !mx_is_dead(is);
      if (running) {
	/* Allocate space to save a shadow of the registers,
	   and pause the lanai so that we can have a consistent
	   view */

	is->board_ops.dump_registers(is, NULL, &num_registers);
	registers = mx_kmalloc(sizeof(*registers) * num_registers,
			       MX_WAITOK|MX_MZERO);
	if (registers == NULL) {
	  MX_WARN(("Could not allocate space to dump registers\n"));
	  goto abort_with_is;
	}
	if (mx_is_dead(is)) {
	  MX_WARN(("Dump raced with lanai crash, try again\n"));
	  status = ENXIO;
	  mx_kfree(registers);
	  goto abort_with_is;
	}
	if (is->board_type != MX_BOARD_TYPE_Z) {
	  mx_sync_init(&tmp_sync, is, 0, "crashdump temp sync");
	  status = mx_lanai_command(is, MX_MCP_CMD_PAUSE, 0,
				    0, 0, &spin_offset, &tmp_sync);
	  if (status) {
	    MX_WARN(("MX_MCP_CMD_PAUSE failed, status = %d\n",  status));
	    mx_sync_destroy(&tmp_sync);
	    goto abort_with_is;
	  }
	  pause_addr = (uint32_t *)((char *)is->lanai.sram + spin_offset);
	  *pause_addr = htonl(1); 
	  MX_STBAR();
	  
	  /* wait a small amount of time to enter dispatch handler */
	  i = 0;
	  do {
	    mx_sleep(&tmp_sync, MX_SMALL_WAIT, MX_SLEEP_NOINTR);
	    i++;
	  } while (i < 20 && (2 != ntohl(*pause_addr)));
	  
	  /* Give up if it takes too long.. */
	  if (2 != ntohl(*pause_addr))
	    MX_WARN(("mx%d: unable to pause Lanai for dump (%d)\n", 
		     is->id, (int)ntohl(*pause_addr)));
	  mx_sync_destroy(&tmp_sync);
	}
	  
	/* obtain the registers */
	is->board_ops.dump_registers(is, registers, &num_registers);

      } else {
	/* The registers were saved when the mcp crashed, so
	   just use them.  The lanai has been frozen, so there
	   is no need to pause it */
	
	registers = is->saved_state.registers;
	num_registers = is->saved_state.num_registers;
	if (registers == NULL) {
	  MX_WARN(("Dead lanai, but saved registers are null?!?\n"));
	  goto abort_with_is;
	}
      }

      dump.sram_size = is->sram_size;
      dump.isr = registers[1];	/* ISR will always be 1st register */
      status = mx_copyout(&dump, out, sizeof (dump), is_kernel);
      if (status) {
	mx_mutex_exit(&is->sync);
	goto abort_with_is;
      }

      status = mx_copyout(registers, (mx_uaddr_t)dump.sram,
			  num_registers * sizeof(*registers), is_kernel);
      status = mx_copyout((void *)is->lanai.sram, 
			  (mx_uaddr_t)dump.sram + num_registers * sizeof(*registers),
			  is->sram_size, is_kernel);

      if (running) {
	mx_kfree(registers);
	if (is->board_type != MX_BOARD_TYPE_Z) {
	  *pause_addr = 0;
	  MX_STBAR();
	}
      }

      if (status) {
	goto abort_with_is;
      }

      mx_release_instance(is);
    }
    break;      

  case MX_GET_INSTANCE_COUNT: 
    status = mx_copyout(&mx_num_instances, out, sizeof (mx_num_instances), is_kernel);
    break;

  case MX_GET_MAX_INSTANCE:
    status = mx_copyout(&mx_max_instance, out, sizeof (mx_max_instance), is_kernel);
    break;

  case MX_GET_MAX_ENDPOINTS:
    status = mx_copyout(&mx_max_endpoints, out, sizeof (mx_max_endpoints), is_kernel);
    break;
    
  case MX_GET_SMALL_MESSAGE_THRESHOLD:
    status = mx_copyout(&mx_small_message_threshold, out, 
			sizeof(mx_small_message_threshold), is_kernel);
    break;
    
  case MX_GET_MEDIUM_MESSAGE_THRESHOLD:
    status = mx_copyout(&mx_medium_message_threshold, out, 
			sizeof(mx_medium_message_threshold), is_kernel);
    break;

  case MX_GET_NIC_ID:
    {
      mx_get_nic_id_t x;
      int i;
      
      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }
      
      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      x.nic_id = 0;
      for (i=0;i<6;i++) {
	x.nic_id = (x.nic_id << 8) + is->mac_addr[i];
      }
      status = mx_copyout(&x, out, sizeof (x), is_kernel);
      mx_release_instance(is);
    }
    break;
    
  case MX_GET_MAX_SEND_HANDLES:
    status = mx_copyout(&mx_max_send_handles, out, 
			sizeof (mx_max_send_handles), is_kernel);
    break;

  case MX_GET_MAX_RDMA_WINDOWS:
    status = mx_copyout(&mx_max_rdma_windows, out, 
			sizeof (mx_max_rdma_windows), is_kernel);
    break;

  case MX_GET_COUNTERS_STRINGS:
    {
      mx_get_counters_strings_t s;
      int i;
      mx_uaddr_t p;
      uint32_t mx_mcp_counters_count;
      const char **mx_mcp_counters;
      char tmp[MX_MAX_STR_LEN];

      if ((status = mx_copyin(in, &s, sizeof (s), is_kernel))) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(s.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }

      status = mx_mcpi.get_counters(is->board_type, &mx_mcp_counters, 
				    &mx_mcp_counters_count);
      if (status) {
	MX_WARN (("Don't know which counters we have\n"));
	status = ENXIO;
	goto abort_with_is;
      }

      if (s.count < mx_mcp_counters_count) {
	s.count = mx_mcp_counters_count;
	status = mx_copyout(&s.count, out, sizeof (s.count), is_kernel);
      } else {
	s.count = mx_mcp_counters_count;
	if ((status = mx_copyout(&s.count, out, sizeof (s.count), is_kernel))) {
	  goto abort_with_is;
	}
	p = (mx_uaddr_t)out + offsetof(mx_get_counters_strings_t, label);
	for (i = 0; i < mx_mcp_counters_count; ++i, p+=MX_MAX_STR_LEN) {
	  strncpy(tmp, mx_mcp_counters[i], MX_MAX_STR_LEN-1); 
	  tmp[MX_MAX_STR_LEN-1] = '\0';
	  if ((status = mx_copyout(tmp, p, MX_MAX_STR_LEN, is_kernel))) {
	    goto abort_with_is;
	  }
	}
      }
      mx_release_instance(is);
    }
    break;

  case MX_GET_LOGGING_STRINGS:
    {
      mx_get_logging_strings_t s;
      int i;
      mx_uaddr_t p;
      uint32_t mx_mcp_logstr_count;
      const char **mx_mcp_logstr;
      char tmp[MX_MAX_STR_LEN];

      if ((status = mx_copyin(in, &s, sizeof (s), is_kernel))) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(s.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }

      mx_mcp_logstr_count = mx_mcpi.get_log_events_count(is->board_type);
      if (!mx_mcp_logstr_count) {
	MX_WARN (("Don't know how many logging events we have\n"));
	status = ENXIO;
	goto abort_with_is;
      }

      if (s.count < mx_mcp_logstr_count) {
	s.count = mx_mcp_logstr_count;
	status = mx_copyout(&s.count, out, sizeof (s.count), is_kernel);
      } else {
	s.count = mx_mcp_logstr_count;
	if ((status = mx_copyout(&s.count, out, sizeof (s.count), is_kernel))) {
	  goto abort_with_is;
	}

	mx_mcp_logstr = (const char **)mx_mcpi.get_log_events(is->board_type);
	p = (mx_uaddr_t)out + offsetof(mx_get_logging_strings_t, label);
	for (i = 0; i < mx_mcp_logstr_count; ++i, p+=MX_MAX_STR_LEN) {
	  strncpy(tmp, mx_mcp_logstr[i], MX_MAX_STR_LEN-1); 
	  tmp[MX_MAX_STR_LEN-1] = '\0';
	  if ((status = mx_copyout(tmp, p, MX_MAX_STR_LEN, is_kernel))) {
	    goto abort_with_is;
	  }
	}
      }
      mx_release_instance(is);
    }
    break;

  case MX_NIC_ID_TO_BOARD_NUM:
    {
      mx_nic_id_to_board_num_t x;
      int i;

      if ((status = mx_copyin(in, &x, sizeof (x), is_kernel))) {
	goto abort_with_nothing;
      }
      mx_mutex_enter(&mx_global_mutex);
      for (i = 0; i < mx_max_instance; ++i) {
	is = mx_instances[i];
	if (!is)
	  continue;
	if ((((uint64_t)is->mac_addr[0] << 40) |
	     ((uint64_t)is->mac_addr[1] << 32) |
	     ((uint64_t)is->mac_addr[2] << 24) |
	     ((uint64_t)is->mac_addr[3] << 16) |
	     ((uint64_t)is->mac_addr[4] <<  8) |
	     ((uint64_t)is->mac_addr[5])) == x.nic_id) {
	  break;
	}
      }
      if (i >= mx_max_instance) {
	status = ENODEV;
	mx_mutex_exit(&mx_global_mutex);
	goto abort_with_nothing;
      }
      mx_mutex_exit(&mx_global_mutex);
      x.board_number = i;
      status = mx_copyout(&x, out, sizeof(x), is_kernel);
    }
    break;

  case MX_NIC_ID_TO_PEER_INDEX:
    {
      mx_lookup_peer_t x;
      uint32_t mac_low32;
      uint16_t mac_high16;
      mx_peer_hash_t *hash;

      if ((status = mx_copyin(in, &x, sizeof (x), is_kernel))) {
	goto abort_with_nothing;
      }
      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }

      mac_high16 = (uint16_t)((x.nic_id >> 32) & 0xffff);
      mac_low32 = (uint32_t)(x.nic_id & 0xffffffff);
      hash = mx_peer_lookup(mac_high16, mac_low32);
      if (hash) {
	/* found in hash table */
	x.index = hash->index;
	status = mx_copyout(&x, out, sizeof(x), is_kernel);
      } else {
	status = ESRCH;
      }
      mx_release_instance(is);
    }
    break;

  case MX_PEER_INDEX_TO_NIC_ID:
    {
      mx_lookup_peer_t x;
      
      if ((status = mx_copyin(in, &x, sizeof (x), is_kernel))) {
	goto abort_with_nothing;
      }
      if (x.index > mx_biggest_peer) {
	status = EINVAL;
      } else {
	x.nic_id = (mx_peer_table[x.index].mac_low32 + 
		    ((uint64_t)mx_peer_table[x.index].mac_high16 << 32));
	status = mx_copyout(&x, out, sizeof(x), is_kernel);
      }
    }
    break;

  case MX_GET_PEER_FORMAT:
    {
      mx_get_peer_format_t x;

      x.sizeof_peer_t = (uint32_t)sizeof(mx_peer_t);
      x.offset_of_type = offsetof(mx_peer_t, type);
      x.offset_of_node_name = offsetof(mx_peer_t, node_name);
      status = mx_copyout(&x, out, sizeof(x), is_kernel);
    }
    break;

  case MX_GET_ROUTE_SIZE:
    {
      uint32_t x;
      mx_instance_state_t *is;

      /* user sends in board number */
      if ((status = mx_copyin(in, &x, sizeof (x), is_kernel))) {
	goto abort_with_nothing;
      }
      is = mx_get_instance(x);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      x = (uint32_t)is->routes[0].block_size;
      mx_release_instance(is);
      status = mx_copyout(&x, out, sizeof(x), is_kernel);
    }
    break;

  case MX_GET_PEER_TABLE:
    status = mx_copyout(mx_peer_table, out, 
			(mx_biggest_peer + 1) * sizeof(mx_peer_table[0]),
			is_kernel);
    break;

  case MX_GET_ROUTE_TABLE:
    {
      mx_get_route_table_t x;
      mx_instance_state_t *is;
      mx_routes_t *routes;
      mx_uaddr_t uaddr;
      int i;

      if ((status = mx_copyin(in, &x, sizeof (x), is_kernel))) {
	goto abort_with_nothing;
      }
      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      uaddr = (mx_uaddr_t)x.routes;
      for (i = 0; i < is->num_ports; i++) {
	routes = &is->routes[i];
	status = mx_copyout(routes->host_table, uaddr, 
			    (mx_biggest_peer + 1) * routes->block_size, 
			    is_kernel);
	uaddr += routes->block_size * (mx_max_nodes + 1);
      }
      mx_release_instance(is);
    }
    break;


  case MX_GET_SERIAL_NUMBER:
    {
      uint32_t board_number;

      if ((status = mx_copyin(in, &board_number,
			      sizeof(board_number), is_kernel))) {
	goto abort_with_nothing;
      }
      is = mx_get_instance(board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      status = mx_copyout(&is->lanai.serial, out,
			  sizeof(is->lanai.serial), is_kernel);
      mx_release_instance(is);
    }
    break;

  case MX_GET_OPENER:
    {
      mx_get_opener_t x;
      mx_endpt_state_t *es;
      int raw = 0;
      int ether = 0;

      if ((status = mx_copyin(in, &x, sizeof(x), is_kernel))) {
	goto abort_with_nothing;
      }

      switch ((int) x.endpt_number) {
      case -2: 
	ether = 1;
	break;
      case -1:
	raw = 1;
	break;
      default:
	if (x.endpt_number >= mx_max_endpoints) {
	  status = EINVAL;
	  goto abort_with_nothing;
	}
	break;
      }
      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      mx_mutex_enter(&is->sync);
      if (raw)
	es = is->raw.es;
      else if (ether)
	es = NULL;
      else 
	es = is->es[x.endpt_number];

      x.closed = 0;
      if (es) {
	bcopy(&es->opener, &x.opener, sizeof(x.opener));
      } else if (!ether || !is->ether_is_open) {
	  x.closed = 1;
      }
      mx_mutex_exit(&is->sync);
      status = mx_copyout(&x, out, sizeof(x), is_kernel);
      mx_release_instance(is);
    }
    break;

  case MX_GET_MAX_PEERS:
    status = mx_copyout(&mx_max_nodes, out, sizeof(mx_max_nodes), is_kernel);
    break;

  case MX_NIC_ID_TO_HOSTNAME:
    {
      mx_uaddr_t va;
      mx_nic_id_hostname_t x;
      uint32_t mac_low32;
      uint16_t mac_high16;
      int peer_index;
      size_t len;
      mx_peer_hash_t * hash;

      peer_index = 0;
      if ((status = mx_copyin(in, &x, sizeof (x), is_kernel))) {
	goto abort_with_nothing;
      }

      mac_high16 = (uint16_t)((x.nic_id >> 32) & 0xffff);
      mac_low32 = (uint32_t)(x.nic_id & 0xffffffff);
      hash = mx_peer_lookup(mac_high16, mac_low32);
      if (hash) {
	/* found in hash table */
	peer_index = hash->index;
	va = (mx_uaddr_t)x.va;
	len = MIN(sizeof(mx_peer_table[peer_index].node_name),
		  (size_t) x.len);
			 
	status = mx_copyout(mx_peer_table[peer_index].node_name, va, len, is_kernel);
      } else {
	status = ESRCH;
      }
    }
    break;

  case MX_HOSTNAME_TO_NIC_ID:
    {
      mx_peer_t peer;
      mx_nic_id_hostname_t x;
      size_t len;
      mx_uaddr_t va;

      if ((status = mx_copyin(in, &x, sizeof (x), is_kernel))) {
	goto abort_with_nothing;
      }
      len = MIN(sizeof(peer.node_name), (size_t)x.len);
      va = (mx_uaddr_t)x.va;
      if ((status = mx_copyin(va, peer.node_name, len, is_kernel))) {
	goto abort_with_nothing;
      }

      /* ensure that user-supplied buffer is null terminated */
      peer.node_name[sizeof(peer.node_name) -1] = '\0';

      status = mx_peer_from_hostname(&peer);
      if (!status) {
	x.nic_id = ((uint64_t)peer.mac_high16 << 32) | peer.mac_low32;
	status = mx_copyout(&x, out, sizeof(x), is_kernel);
      }
    }
    break;

  case MX_SET_HOSTNAME:
    {
      mx_set_hostname_t x;
      mx_uaddr_t va;
      char *name;

      if (!privileged) {
	status = EPERM;
	goto abort_with_nothing;
      }
      if ((status = mx_copyin(in, &x, sizeof (x), is_kernel))) {
	goto abort_with_nothing;
      }
      if (x.len >= sizeof(mx_default_hostname)) {
	status = ERANGE;
	goto abort_with_nothing;
      }
      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      name = mx_kmalloc(x.len + 1, MX_WAITOK);
      if (!name) {
	status = ENOMEM;
	goto abort_with_is;
      }
      va = (mx_uaddr_t)x.va; 
      status = mx_copyin(va, name, x.len, is_kernel);
      if (status) {
	mx_kfree(name);
	goto abort_with_is;
      }
      name[x.len] = '\0';
      mx_set_hostname(is, name);
      mx_kfree(name);
      mx_release_instance(is);
    }
    break;

  case MX_GET_MAX_EVENT_SIZE:
    {
      uint32_t max_event_size;
      status = mx_mcpi.get_param(0, 0, "MX_MCP_MAX_EVENT_SIZE",
				 &max_event_size);
      if (status)
	goto abort_with_nothing;

      status = mx_copyout(&max_event_size, out, sizeof(max_event_size), is_kernel);
    }
    break;

  case MX_GET_CACHELINE_SIZE:
    {
      status = mx_copyout(&mx_cacheline_size, out, sizeof(mx_cacheline_size), is_kernel);
    }
    break;

  case MX_RUN_DMABENCH:
    {
      mx_dmabench_t x;

      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status)
	goto abort_with_nothing;

      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      
      status = mx_run_dmabench(is, &x);
      if (!status)
	status = mx_copyout(&x, out, sizeof(x), is_kernel);

      mx_release_instance(is);
    }
    break;

  case MX_GET_BOARD_STATUS:
    {
      uint32_t board_num, board_status;

      status = mx_copyin(in, &board_num, sizeof(board_num), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(board_num);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      board_status = is->saved_state.reason;
      status = mx_copyout(&board_status, out, sizeof(board_status), is_kernel);
      mx_release_instance(is);
    }
    break;
  case MX_GET_CPU_FREQ:
    {
      uint32_t board_num;

      status = mx_copyin(in, &board_num, sizeof(board_num), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(board_num);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      status = mx_copyout(&is->cpu_freq, out, sizeof(is->cpu_freq), is_kernel);
      mx_release_instance(is);
    }
    break;

  case MX_GET_PCI_FREQ:
    {
      uint32_t board_num;

      status = mx_copyin(in, &board_num, sizeof(board_num), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(board_num);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      status = mx_copyout(&is->pci_freq, out, sizeof(is->pci_freq), is_kernel);
      mx_release_instance(is);
    }
    break;

  case MX_GET_MIN_LIGHT_ENDPOINTS:
    status = mx_copyout(&mx_min_light_endpoints, out, sizeof (mx_min_light_endpoints), is_kernel);
    break;

  case MX_GET_MAX_LIGHT_ENDPOINTS:
    status = mx_copyout(&mx_max_light_endpoints, out, sizeof (mx_max_light_endpoints), is_kernel);
    break;

  case MX_GET_INTR_COAL:
    {
      mx_intr_coal_t x;

      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      status = mx_mcpi.get_param(is->id, is->lanai.sram, "intr_coal_delay",
				 &x.delay);
      if (status)
	goto abort_with_is;

      status = mx_copyout(&x, out, sizeof(x), is_kernel);
      mx_release_instance(is);
      
    }
    break;
  case MX_SET_INTR_COAL:
    {
      mx_intr_coal_t x;

      if (!privileged) {
	status = EPERM;
	goto abort_with_nothing;
      }

      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      if (x.delay == 0 || x.delay > 1000) {
	status = ERANGE;
	goto abort_with_nothing;
      }

      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      status = mx_mcpi.set_param(is->id, is->lanai.sram, "intr_coal_delay",
				 x.delay);
      if (status)
	goto abort_with_is;

      mx_release_instance(is);
      
    }
    break;
  case MX_GET_MAPPER_STATE:
    {
      mx_mapper_state_t x;
      unsigned int port;

      status = mx_copyin(in, &x, sizeof(x), 0);
      if (status)
	goto abort_with_nothing;
      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }

      port = (unsigned int) x.iport;
      if (port >= is->num_ports) {
	status = EINVAL;
	goto abort_with_is;
      }
      status = mx_copyout(&is->raw.mapper_state[port], out, sizeof(x), is_kernel);
      mx_release_instance(is);
    }
    break;

  case MX_RAW_GET_PARAMS:
    {
      mx_raw_params_t x;
      
      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status)
	goto abort_with_nothing;
      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }

      x.raw_mtu = MX_RAW_BYTES;
      x.raw_max_route = MX_RAW_MAXROUTE;
      x.raw_num_tx_bufs = is->raw.max_tx_idx;
      if (x.raw_num_tx_bufs == 0)
	x.raw_num_tx_bufs = is->kreqq_max_index;
      x.raw_num_tx_bufs -= 1;
      status = mx_copyout(&x, out, sizeof(x), is_kernel);
      mx_release_instance(is);
    }
    break;

  case MX_REMOVE_PEER:
    {
      mx_raw_destination_t d;

      if (!privileged) {
	status = EPERM;
	goto abort_with_nothing;
      }

      status = mx_copyin(in, &d, sizeof(d), is_kernel);
      if (status != 0)
	goto abort_with_nothing;
      status = mx_peer_remove(d.mac_high16, d.mac_low32);
    }
    break;

  case MX_GET_LINK_STATE:
    {
      uint32_t board_num;

      status = mx_copyin(in, &board_num, sizeof(board_num), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(board_num);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      status = mx_copyout(&is->link_state, out, sizeof(is->link_state), 
			  is_kernel);
      mx_release_instance(is);
    }
    break;
    
  case MX_GET_PRODUCT_CODE:
  case MX_GET_PART_NUMBER:
    {
      mx_get_eeprom_string_t x;
      char *unknown = "unknown";
      char *ptr;
      int len;

      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      if (cmd == MX_GET_PRODUCT_CODE)
	ptr = is->lanai.product_code;
      else
	ptr = is->lanai.part_number;

      if (ptr == NULL) {
	ptr = unknown;
      }

      len = strlen(ptr) + 1;
      if (len > MX_MAX_STR_LEN)
	len = MX_MAX_STR_LEN;

      status = mx_copyout(ptr, (mx_uaddr_t)x.buffer, len, is_kernel);
      mx_release_instance(is);
    }
    break;
  case MX_GET_VERSION:
    {
      mx_get_version_t x;
      x.driver_api_magic = MX_DRIVER_API_MAGIC;
      strncpy(x.version_str,MX_VERSION_STR, sizeof(x.version_str));
      strncpy(x.build_str,MX_BUILD_STR, sizeof(x.build_str));
      x.version_str[sizeof(x.version_str) - 1] = 0;
      x.build_str[sizeof(x.build_str) - 1] = 0;
      status = mx_copyout(&x, out, sizeof x, is_kernel);
    }
    break;

  case MX_GET_SRAM_SIZE:
    {
      mx_get_board_val_t x;
      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      x.val = is->sram_size;
      status = mx_copyout(&x, out, sizeof x, is_kernel);
      mx_release_instance(is);

    }
    break;

  case MX_GET_DUMP_REG_COUNT:
    
    {
      mx_get_board_val_t x;
      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      is->board_ops.dump_registers(is, NULL, &x.val);
      status = mx_copyout(&x, out, sizeof x, is_kernel);
      mx_release_instance(is);

    }
    break;

  case MX_GET_BOARD_TYPE:
    {
      mx_get_board_val_t x;
      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      x.val = is->board_type;
      status = mx_copyout(&x, out, sizeof x, is_kernel);
      mx_release_instance(is);

    }
    break;

  case MX_PCI_CFG_READ:
  case MX_PCI_CFG_WRITE:
    {
      uint8_t val8;
      uint16_t val16;
      uint32_t val32;
      mx_pci_cfg_t x;

      if (!privileged) {
	status = EPERM;
	goto abort_with_nothing;
      }
      status = mx_copyin(in, &x, sizeof(x), is_kernel);
      if (status) {
	goto abort_with_nothing;
      }

      is = mx_get_instance(x.board_number);
      if (!is) {
	status = ENODEV;
	goto abort_with_nothing;
      }
      if (cmd == MX_PCI_CFG_READ) {
	switch (x.width) {
	case 1:
	  status = mx_read_pci_config_8(is, x.offset, &val8);
	  x.val = val8;
	  break;
	case 2:
	  status = mx_read_pci_config_16(is, x.offset, &val16);
	  x.val = val16;
	  break;
	case 4:
	  status = mx_read_pci_config_32(is, x.offset, &val32);
	  x.val = val32;
	  break;
	default:
	  status = EINVAL;
	}
      } else {
	switch (x.width) {
	case 1:
	  status = mx_write_pci_config_8(is, x.offset, x.val);
	  break;
	case 2:
	  status = mx_write_pci_config_16(is, x.offset, x.val);
	  break;
	case 4:
	  status = mx_write_pci_config_32(is, x.offset, x.val);
	  break;
	default:
	  status = EINVAL;
	}
      }
      if (!status)
	status = mx_copyout(&x, out, sizeof x, is_kernel);
      mx_release_instance(is);
    }
    break;

  default:
    status = ENOTTY;
  }
  
  return status;

 abort_with_is:
  mx_release_instance(is);
 abort_with_nothing:
  return status;
}


/*
 * Dispatch an ioctl issued on an open endpoint.
 *
 * es  - state of the endpoint the request was issued on
 * cmd - ioctl command code (one of the MX_* cases handled below)
 * in  - address of the command's argument structure; commands that
 *       return data write it back to the same address (out == in)
 *
 * Arguments are transferred with mx_copyin()/mx_copyout(), normally
 * passing es->is_kernel as the last argument.
 *
 * Returns 0 on success or an errno-style value:
 *   EIO    - the board is reported dead by mx_is_dead() (the only
 *            request still let through is MX_WAIT while the instance
 *            has MX_PARITY_RECOVERY set)
 *   ENOTTY - cmd is not handled here
 *   other  - whatever the individual command handler produced
 */
int
mx_common_ioctl(mx_endpt_state_t *es, uint32_t cmd, const mx_uaddr_t in)
{
  mx_uaddr_t out = in;
  uint32_t output_bytes;
  int status = 0;


  MX_VAR_MAY_BE_UNUSED (out);
  MX_VAR_MAY_BE_UNUSED (output_bytes);

  /* refuse everything on a dead board, except MX_WAIT during parity
     recovery */
  if (mx_is_dead(es->is)) {
    if (!((es->is->flags & MX_PARITY_RECOVERY) && cmd == MX_WAIT)) {
      MX_WARN(("firmware dead on board %d, ignoring ioctl\n", es->is->id));
      return EIO;
    }
  }
  switch (cmd) {

  case MX_REGISTER:
    /* the argument is handed to mx_pin() untouched; it is decoded there */
    status = mx_pin(es, in);
    break;

  case MX_DEREGISTER:
    /* argument: 32-bit handle, must be below mx_max_rdma_windows */
    {
      uint32_t handle;
      status = mx_copyin(in, &handle, sizeof(handle), es->is_kernel);
      if (status) {
	goto abort;
      }
      
      if (handle >= mx_max_rdma_windows) {
	MX_WARN (("MX_DEREGISTER: Handle %d too large\n", handle));
	status = EINVAL;
	goto abort;
      }
      status = mx_deregister(es, handle);
    }
    break;

  case MX_WAIT:
    /* we try to copy in a 32-bit opaque handle */
    /* NOTE(review): what is actually copied in (and back out) below is
       a whole mx_wait_t; the comment above looks stale -- confirm */
    {
      mx_wait_t x;
      mx_instance_state_t *is = es->is;
      int sleep_status;

      status = mx_copyin(in, &x, sizeof(x), es->is_kernel);
      if (status) {
	goto abort;
      }
      mx_mutex_enter(&es->sync);
      es->num_waiters++;
      /* a timeout that is already armed (non-zero progression_timeout,
	 see MX_ARM_TIMER) takes precedence over the caller's x.timeout */
      if (!es->progression_timeout)
	es->progression_timeout = x.timeout;
      /* sleep in slices of at most 50 time units (presumably
	 milliseconds -- confirm against mx_sleep), dropping es->sync
	 around each sleep so progression_timeout can be changed
	 meanwhile.  Keep going while mx_sleep() returns EAGAIN and
	 some timeout is left. */
      do {
	unsigned time_slot = es->progression_timeout;
	if (time_slot >= 50)
	  time_slot = 50;
	es->progression_timeout -= time_slot;
	mx_mutex_exit(&es->sync);
	sleep_status = mx_sleep(&es->wait_sync, time_slot, MX_SLEEP_INTR);
	mx_mutex_enter(&es->sync);
      } while (sleep_status == EAGAIN && es->progression_timeout);
      es->progression_timeout = 0;
      /* a successful sleep with no pending no_mcp_req_wakeups is
	 reported through mcp_wake_events; otherwise a successful sleep
	 consumes one no_mcp_req_wakeups credit */
      x.mcp_wake_events = 0;
      if (sleep_status == 0 && mx_atomic_read(&es->no_mcp_req_wakeups) == 0) {
	x.mcp_wake_events = 1;
      } else if (sleep_status == 0) {
	mx_atomic_subtract(1, &es->no_mcp_req_wakeups);
      }
      /* every branch of the chain below assigns x.status again, so
	 this value is always overwritten */
      x.status = MX_WAIT_STATUS_GOOD;
      if (es->parity_errors_detected != is->parity_errors_detected) {
	/* new parity error? */
	es->parity_errors_detected = is->parity_errors_detected;
	x.status = MX_WAIT_PARITY_ERROR_DETECTED;
      } else if (es->parity_errors_corrected != is->parity_errors_corrected) {
	/* finished handling parity error? */
	es->parity_errors_corrected = is->parity_errors_corrected;
	x.status = MX_WAIT_PARITY_ERROR_CORRECTED;
      } else {
	/* endpoint did something bad, or there was an uncorrectable
	   parity error which only affects this endpoint */
	/* report the pending endpoint error and clear it */
	x.status = es->endpt_error;
	es->endpt_error = 0;
      }
      es->num_waiters--;
      mx_mutex_exit(&es->sync);
      status = mx_copyout(&x, out, sizeof(x), es->is_kernel);
    }
    break;

  case MX_APP_WAIT:
    /* single sleep of x.timeout on app_wait_sync; app_waiting is set
       for the duration of the sleep */
    {
      mx_wait_t x;
      int sleep_status;

      status = mx_copyin(in, &x, sizeof(x), es->is_kernel);
      if (status) {
	goto abort;
      }
      mx_mutex_enter(&es->sync);
      es->num_waiters++;
      es->app_waiting = 1;
      mx_mutex_exit(&es->sync);
      sleep_status = mx_sleep(&es->app_wait_sync, x.timeout, MX_SLEEP_INTR);
      mx_mutex_enter(&es->sync);
      /* same wake-up accounting as in MX_WAIT */
      x.mcp_wake_events = 0;
      if (sleep_status == 0 && mx_atomic_read(&es->no_mcp_req_wakeups) == 0) {
	x.mcp_wake_events = 1;
      } else if (sleep_status == 0) {
	mx_atomic_subtract(1, &es->no_mcp_req_wakeups);
      }
      /* any non-zero return from mx_sleep() is reported as
	 MX_WAIT_TIMEOUT_OR_INTR */
      if (sleep_status == 0)
	x.status = MX_WAIT_STATUS_GOOD;
      else
	x.status = MX_WAIT_TIMEOUT_OR_INTR;
      es->app_waiting = 0;
      es->num_waiters--;
      mx_mutex_exit(&es->sync);
      status = mx_copyout(&x, out, sizeof(x), es->is_kernel);
    }
    break;

  case MX_CLEAR_WAIT:
    /* reset wait_sync, but only while nobody is counted in
       num_waiters; otherwise fail with EBUSY */
    {
      mx_mutex_enter(&es->sync);
      if (es->num_waiters != 0) {
	status = EBUSY;
      } else {
	mx_sync_reset(&es->wait_sync);
      }
      mx_mutex_exit(&es->sync);
    } break;

  case MX_WAKE:
    {
      /* used by an exiting application to alert the
	 progression thread */
	 
      /* the counter lets the woken MX_WAIT see that this wake-up was
	 not an MCP event (see the mcp_wake_events logic there) */
      mx_atomic_add(1, &es->no_mcp_req_wakeups);
      mx_wake(&es->wait_sync);
    } break;

  case MX_APP_WAKE:
    {
      /* used by progression thread to wake application 
	 thread */
      /* note: app_waiting is cleared here without taking es->sync */
      mx_atomic_add(1, &es->no_mcp_req_wakeups);
      es->app_waiting = 0;
      mx_wake(&es->app_wait_sync);
    } break;

  case MX_GET_COPYBLOCKS:
    /* describe the endpoint's copyblocks: the regions are laid out
       back to back in the order sendq, recvq, eventq,
       user_mmapped_sram, user_mmapped_zreq, kernel_window */
    {
      mx_get_copyblock_t desc;

      desc.sendq_offset = 0;
      desc.sendq_len = es->sendq.size;

      desc.recvq_offset = desc.sendq_offset + es->sendq.size;
      desc.recvq_len = es->recvq.size;

      desc.eventq_offset = desc.recvq_offset + es->recvq.size;
      desc.eventq_len = es->eventq.size;

      desc.user_mmapped_sram_offset = desc.eventq_offset + es->eventq.size;
      desc.user_mmapped_sram_len = es->user_mmapped_sram.size;
      desc.user_mmapped_zreq_offset = (desc.user_mmapped_sram_offset + 	
				       es->user_mmapped_sram.size);
      desc.user_mmapped_zreq_len = es->user_mmapped_zreq.size;
      desc.kernel_window_offset = (desc.user_mmapped_zreq_offset +
				   desc.user_mmapped_zreq_len);
      /* one page if the instance has a kernel window, nothing otherwise */
      desc.kernel_window_len = (es->is->kernel_window ? MX_PAGE_SIZE : 0);

      /* offsets within sram */
      /* the request queue comes first; its length and the data queue's
	 length are read from the MCP parameters named below */
      desc.user_reqq_offset = 0;
      status = mx_mcpi.get_param(es->is->id, es->is->lanai.sram, 
				 "MX_MCP_UREQQ_CNT",
				 &desc.user_reqq_len);
      if (status)
	goto abort;

      desc.user_dataq_offset = desc.user_reqq_len;
      status = mx_mcpi.get_param(es->is->id, es->is->lanai.sram, 
				 "MX_MCP_UDATAQ_SIZE",
				 &desc.user_dataq_len);
      
      if (status)
	goto abort;
      
      status = mx_copyout(&desc, out, sizeof(desc), es->is_kernel);
    }
    break;

  case MX_SET_MAPPER_STATE:
    /* raw endpoints only: store the caller's mapper state for the
       port named by x.iport */
    {
      mx_mapper_state_t x;
      unsigned int port;

      if ((es->flags & MX_ES_RAW) == 0) {
	status = EPERM;
	goto abort;
      }
      /* NOTE(review): the last argument is a literal 0 here, whereas
	 the other cases pass es->is_kernel -- confirm this is
	 intentional */
      status = mx_copyin(in, &x, sizeof(x), 0);
      if (status)
	goto abort;
      /* comparing as unsigned also rejects a negative iport, should
	 that field be signed */
      port = (unsigned int) x.iport;
      if (port >= es->is->num_ports) {
	status = EINVAL;
	goto abort;
      }
      bcopy(&x, &es->is->raw.mapper_state[port], sizeof (x));
   }
    break;

  case MX_RECOVER_ENDPOINT:
    /* no argument; just re-runs mx_open_mcp_endpoint() on this
       endpoint */
    {
      status = mx_open_mcp_endpoint(es);
    }
    break;

  case MX_SET_ROUTE_BEGIN:
  case MX_SET_ROUTE_END:
    /* raw endpoints only; no argument is copied in */
    {
      if ((es->flags & MX_ES_RAW) == 0) {
	status = EPERM;
	goto abort;
      }

      if (cmd == MX_SET_ROUTE_BEGIN)
	status = mx_kraw_set_route_begin(es);
      else
	status = mx_kraw_set_route_end(es);

    }
    break;

  case MX_SET_ROUTE:
    /* raw endpoints only; same call as MX_RAW_CLEAR_ROUTES but with 0
       as the last argument of mx_kraw_set_route() */
    {
      mx_set_route_t x;

      if ((es->flags & MX_ES_RAW) == 0) {
	status = EPERM;
	goto abort;
      }
      if ((status = mx_copyin(in, &x, sizeof (x), es->is_kernel))) {
	goto abort;
      }
      status = mx_kraw_set_route(es, &x, 0);
    }
    break;

  case MX_RAW_CLEAR_ROUTES:
    /* raw endpoints only; last argument 1 presumably selects the
       "clear" variant of mx_kraw_set_route() -- confirm there */
    {
      mx_set_route_t x;

      if ((es->flags & MX_ES_RAW) == 0) {
	status = EPERM;
	goto abort;
      }
      if ((status = mx_copyin(in, &x, sizeof(x), es->is_kernel)))
	  goto abort;
      status = mx_kraw_set_route(es, &x, 1);
    }
    break;

  case MX_RAW_GET_NEXT_EVENT:
    /* raw endpoints only; the structure is both input and output and
       is copied back only when mx_kraw_next_event() succeeds */
    {
      mx_raw_next_event_t x;

      if ((es->flags & MX_ES_RAW) == 0) {
	status = EPERM;
	goto abort;
      }
      if ((status = mx_copyin(in, &x, sizeof(x), es->is_kernel)))
	  goto abort;
      status = mx_kraw_next_event(es, &x);
      if (!status)
	status = mx_copyout(&x, out, sizeof(x), es->is_kernel);
    }
    break;
  case MX_RAW_SEND:
    /* raw endpoints only; nothing is copied back */
    {
      mx_raw_send_t x;

      if ((es->flags & MX_ES_RAW) == 0) {
	status = EPERM;
	goto abort;
      }
      if ((status = mx_copyin(in, &x, sizeof(x), es->is_kernel)))
	  goto abort;
      status = mx_kraw_send(es, &x);
    }
    break;

  case MX_RAW_TICKS:
    /* raw endpoints only; argument: 32-bit value forwarded to
       mx_kraw_tick_change() for the whole instance */
    {
      uint32_t enable;

      if ((es->flags & MX_ES_RAW) == 0) {
	status = EPERM;
	goto abort;
      }
      if ((status = mx_copyin(in, &enable, sizeof(enable), es->is_kernel)))
	  goto abort;
      status = mx_kraw_tick_change(es->is, enable);
    }
    break;

  case MX_DIRECT_GET:
    /* single-segment direct get from another endpoint (src_endpt) of
       the same instance into this endpoint */
    {
      mx_direct_get_t xget;
      mx_endpt_state_t *es2;
      mx_shm_seg_t src_seg, dst_seg;

      if ((status = mx_copyin(in, &xget, sizeof(xget), es->is_kernel)))
	goto abort;
      es2 = mx_get_endpoint(es->is, xget.src_endpt);
      if (!es2) {
	status = ESRCH;
	goto abort;
      }
      /* do not check src_session since the MX_DIRECT_GET ABI has been
       * distributed without the lib setting src_session and the driver
       * checking it.
       */

      /* wrap both addresses in one-element segment lists of the
	 requested length */
      src_seg.vaddr = xget.src_va;
      src_seg.len = xget.length;
      dst_seg.vaddr = xget.dst_va;
      dst_seg.len = xget.length;

      status = mx_direct_get(es, &dst_seg, 1, es2, &src_seg, 1, xget.length);
      /* drop the reference taken by mx_get_endpoint() */
      mx_put_endpoint(es2);
      if (status)
	goto abort;
    }
    break;

  case MX_DIRECT_GETV:
    /* vectored direct get; unlike MX_DIRECT_GET the source endpoint's
       session id must match xget.src_session */
    {
      mx_direct_getv_t xget;
      mx_endpt_state_t *es2;

      if ((status = mx_copyin(in, &xget, sizeof(xget), es->is_kernel))) {
	goto abort;
      }

      /* bound both segment counts by MX_MAX_SEGMENTS before they are
	 used */
      if (xget.dst_nsegs > MX_MAX_SEGMENTS || xget.src_nsegs > MX_MAX_SEGMENTS) {
	status = E2BIG;
	goto abort;
      }	

      es2 = mx_get_endpoint(es->is, xget.src_endpt);
      if (!es2) {
	status = ESRCH;
	goto abort;
      }
      if (es2->session_id != xget.src_session) {
	status = EPERM;
	mx_put_endpoint(es2);
	goto abort;
      }

      status = mx_direct_get(es, &xget.dst_segs, xget.dst_nsegs,
			     es2, &xget.src_segs, xget.src_nsegs, xget.length);
      mx_put_endpoint(es2);
      if (status)
	goto abort;
    }
    break;

  case MX_WAKE_ENDPOINT:
    /* wake another endpoint (xwake.endpt) of the same instance */
    {
      mx_wake_endpt_t xwake;
      mx_endpt_state_t *es2;
      if ((status = mx_copyin(in, &xwake, sizeof(xwake), es->is_kernel)))
	  goto abort;
      es2 = mx_get_endpoint(es->is, xwake.endpt);
      if (!es2) {
	status = ESRCH;
	goto abort;
      }
      /* no overflow possible because the number of 
	 no_mcp_req_wakeup is bounded by the shmem queue size */
      mx_atomic_add(1, &es2->no_mcp_req_wakeups);
      /* wake whoever sleeps in MX_APP_WAIT if app_waiting is set,
	 otherwise whoever sleeps on wait_sync (MX_WAIT) */
      if (es2->app_waiting) {
	es2->app_waiting = 0;
	mx_wake(&es2->app_wait_sync);
      } else {
	mx_wake(&es2->wait_sync);
      }
      mx_put_endpoint(es2);
    }
    break;
#if MX_OS_UDRV
  case MX_WRITE_PIO_REQ:
    /* only built when MX_OS_UDRV is set: PIO-copy x.len bytes of
       x.data to offset x.offset of a mapped copyblock */
    {
      mx_write_pio_req_t x;
      mx_copyblock_t *cb;

      status = mx_copyin(in, &x, sizeof(x), es->is_kernel);
      if (status) {
	goto abort;
      }
      /* MX_BOARD_TYPE_Z boards use the zreq mapping, all others the
	 sram mapping */
      cb = (es->is->board_type == MX_BOARD_TYPE_Z ?
	    &es->user_mmapped_zreq : &es->user_mmapped_sram);
      /* out-of-range requests trip the assertion rather than
	 returning an error code */
      mx_always_assert(x.offset >= 0 && x.len <= 64 &&
		       x.offset + x.len <= cb->size);
      mx_pio_memcpy((char *)cb->addr + x.offset,
		  x.data, x.len, 0);
    }
    break;
#endif
  case MX_ARM_TIMER:
    /* argument: 32-bit timeout stored in progression_timeout, which
       MX_WAIT gives precedence over its own x.timeout */
    {
      uint32_t ms;

      status = mx_copyin(in, &ms, sizeof(ms), es->is_kernel);
      if (status) {
	goto abort;
      }
      mx_mutex_enter(&es->sync);
      es->progression_timeout = ms;
      mx_mutex_exit(&es->sync);
    }
    break;

  default:
    status = ENOTTY;
  }

  /* both the error gotos and the normal fall-through end up here;
     there is nothing to undo at this level */
 abort:
  return status;
}


/*
 * Build a one-line, human-readable status summary for a board.
 *
 * unit - board number, looked up with mx_get_instance()
 * str  - on success receives a 512-byte buffer obtained from
 *        mx_kmalloc() that holds the NUL-terminated summary.  The
 *        buffer is not freed here, so releasing it is left to the
 *        caller.
 * len  - on success receives strlen() of the summary
 *
 * Returns 0 on success, ENODEV if the board does not exist, ENOMEM if
 * the buffer cannot be allocated, or the error returned by the
 * "mcp_version" parameter lookup.  *str and *len are left untouched on
 * failure.
 */
int
mx_instance_status_string(int unit, char **str, int *len)
{
  char *c;
  char mcp_status_string[64];
  mx_instance_state_t *is;
  uint32_t mcp_version, mcp_status, status;
  int ret = 0;

  is = mx_get_instance(unit);
  if (!is) {
    ret = ENODEV;
    goto abort_with_nothing;
  }

  c = mx_kmalloc(512, MX_MZERO|MX_WAITOK);
  if (c == NULL) {
    ret = ENOMEM;
    goto abort_with_is;
  }

  ret = mx_mcpi.get_param(is->id, (volatile uint8_t *)NULL, 
			     "mcp_version", &mcp_version);
  if (ret)
    goto abort_with_buf;

  if (mx_is_dead(is)) {
    /* a failed "mcp_status" read is not fatal: the summary is still
       produced, with all bits set in place of the status word */
    status = mx_mcpi.get_param(is->id, is->lanai.sram, "mcp_status", 
			       &mcp_status);
    if (status !=0)
      mcp_status = -1;
    sprintf(mcp_status_string, "not responding (%d,%d,0x%x)", 
	    is->saved_state.reason, is->saved_state.arg, mcp_status);
  } else {
    /* this literal used to be "running)"; the stray ')' had no
       matching '(' anywhere in the summary line */
    sprintf(mcp_status_string, "running");
  }

  sprintf(c, "mx%d: Driver 0x%x, MCP 0x%x. status:%s, uptime %ld, %d ports\n",
	  is->id, MX_MCP_DRIVER_API_VERSION, mcp_version, mcp_status_string, 
	  (long)ntohl(*is->lanai_uptime_ptr), is->num_ports);

  *len = (int)strlen(c);
  *str = c;
  /* only the instance reference is dropped; the buffer now belongs to
     whoever reads *str */
  mx_release_instance(is);
  return 0;


 abort_with_buf:
  mx_kfree(c);
 abort_with_is:
  mx_release_instance(is);
 abort_with_nothing:
  return ret;
}

/*
 * Scatter/gather copy between two segment lists.
 *
 * Walks src_segs[0 .. src_nsegs) and dst_segs[0 .. dst_nsegs) side by
 * side and moves at most "length" bytes, one contiguous piece at a
 * time, through mx_arch_copy_user_to_user().  src_space is handed to
 * that routine unchanged.  The walk ends once "length" bytes have been
 * moved or either list has no segment left.
 *
 * Returns 0, or the first non-zero status reported by
 * mx_arch_copy_user_to_user().  Running out of segments before
 * "length" bytes were moved is not reported as an error.
 */
int
mx_direct_get_common (mx_shm_seg_t *dst_segs, uint32_t dst_nsegs,
		      void * src_space, mx_shm_seg_t *src_segs, uint32_t src_nsegs,
		      uint32_t length)
{
  /* position in each list: segment index plus byte offset inside it */
  uint32_t src_idx = 0, src_off = 0;
  uint32_t dst_idx = 0, dst_off = 0;

  while (length > 0 && src_idx < src_nsegs && dst_idx < dst_nsegs) {
    mx_shm_seg_t *src = &src_segs[src_idx];
    mx_shm_seg_t *dst = &dst_segs[dst_idx];
    uint32_t chunk = length;

    /* never run past the end of the current source segment ... */
    if (!(chunk < src->len - src_off))
      chunk = src->len - src_off;
    /* ... nor past the end of the current destination segment */
    if (!(chunk < dst->len - dst_off))
      chunk = dst->len - dst_off;

    /* an empty piece (zero-length segment) is skipped, not copied */
    if (chunk != 0) {
      int status = mx_arch_copy_user_to_user(dst->vaddr + dst_off,
					     src->vaddr + src_off,
					     src_space,
					     chunk);
      if (status)
	return status;
    }

    /* step to the next source segment once this one is used up */
    src_off += chunk;
    if (src_off == src->len) {
      src_idx++;
      src_off = 0;
    }

    /* same for the destination side */
    dst_off += chunk;
    if (dst_off == dst->len) {
      dst_idx++;
      dst_off = 0;
    }

    length -= chunk;
  }

  return 0;
}
